2013-10-25 10:23:11 -07:00
|
|
|
/******************************************************************************
|
|
|
|
Copyright (C) 2013 by Hugh Bailey <obs.jim@gmail.com>
|
|
|
|
|
|
|
|
This program is free software: you can redistribute it and/or modify
|
|
|
|
it under the terms of the GNU General Public License as published by
|
2013-12-02 21:24:38 -08:00
|
|
|
the Free Software Foundation, either version 2 of the License, or
|
2013-10-25 10:23:11 -07:00
|
|
|
(at your option) any later version.
|
|
|
|
|
|
|
|
This program is distributed in the hope that it will be useful,
|
|
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
GNU General Public License for more details.
|
|
|
|
|
|
|
|
You should have received a copy of the GNU General Public License
|
|
|
|
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
|
|
******************************************************************************/
|
|
|
|
|
|
|
|
#include "format-conversion.h"
|
|
|
|
#include <xmmintrin.h>
|
|
|
|
#include <emmintrin.h>
|
|
|
|
|
Revamp API and start using doxygen
The API used to be designed in such a way to where it would expect
exports for each individual source/output/encoder/etc. You would export
functions for each and it would automatically load those functions based
on a specific naming scheme from the module.
The idea behind this was that I wanted to limit the usage of structures
in the API so only functions could be used. It was an interesting idea
in theory, but this idea turned out to be flawed in a number of ways:
1.) Requiring exports to create sources/outputs/encoders/etc meant that
you could not create them by any other means, which meant that
things like faruton's .net plugin would become difficult.
2.) Export function declarations could not be checked, therefore if you
created a function with the wrong parameters and parameter types,
the compiler wouldn't know how to check for that.
3.) Required overly complex load functions in libobs just to handle it.
It makes much more sense to just have a load function that you call
manually. Complexity is the bane of all good programs.
4.) It required that you have functions of specific names, which looked
and felt somewhat unsightly.
So, to fix these issues, I replaced it with a more commonly used API
scheme, seen commonly in places like kernels and typical C libraries
with abstraction. You simply create a structure that contains the
callback definitions, and you pass it to a function to register that
definition (such as obs_register_source), which you call in the
obs_module_load of the module.
It will also automatically check the structure size and ensure that it
only loads the required values if the structure happened to add new
values in an API change.
The "main" source file for each module must include obs-module.h, and
must use OBS_DECLARE_MODULE() within that source file.
Also, started writing some doxygen documentation into the main library
headers. Will add more detailed documentation as I go.
2014-02-12 07:04:50 -08:00
|
|
|
/*
 * Reinterpret the storage of an addressable value (used here on __m128i
 * variables) as an array of 32-bit words and read word 0 / word 1.
 * The argument must be an lvalue because its address is taken.
 */
#define get_m128_32_0(val) (((uint32_t*)&(val))[0])
#define get_m128_32_1(val) (((uint32_t*)&(val))[1])
|
2013-10-25 10:23:11 -07:00
|
|
|
|
2014-02-09 06:59:00 -08:00
|
|
|
/*
 * Masks two 128-bit rows with lum_mask, shifts each whole register right by
 * one byte, narrows the eight 32-bit lanes to eight bytes (signed saturation
 * 32->16, then unsigned saturation 16->8) and stores the result as two
 * 32-bit writes:
 *   - the four bytes derived from line1 at lum_plane + lum_pos0
 *   - the four bytes derived from line2 at lum_plane + lum_pos1
 *
 * The stores are done through uint32_t pointers, so four bytes are written
 * at each position.
 */
static FORCE_INLINE void pack_lum(uint8_t *lum_plane,
		uint32_t lum_pos0, uint32_t lum_pos1,
		const __m128i line1, const __m128i line2,
		const __m128i lum_mask)
{
	/* keep only the masked byte of every lane and move it down one byte */
	const __m128i row0 = _mm_srli_si128(_mm_and_si128(line1, lum_mask), 1);
	const __m128i row1 = _mm_srli_si128(_mm_and_si128(line2, lum_mask), 1);

	/* 4+4 dwords -> 8 words, then 8 words -> 8 bytes (duplicated) */
	__m128i packed = _mm_packs_epi32(row0, row1);
	packed = _mm_packus_epi16(packed, packed);

	/* dword 0 holds the line1 bytes, dword 1 the line2 bytes */
	*(uint32_t*)(lum_plane + lum_pos0) = get_m128_32_0(packed);
	*(uint32_t*)(lum_plane + lum_pos1) = get_m128_32_1(packed);
}
|
|
|
|
|
2014-02-09 06:59:00 -08:00
|
|
|
/*
 * Averages the uv_mask-selected 16-bit components of a 2-row by 4-pixel
 * group down to two 2x2 averages and writes four bytes at
 * uv_plane + chroma_pos.
 *
 * Steps: mask both rows, add them (vertical sum), add each 32-bit lane to
 * its horizontal neighbour, divide every 16-bit word by four, gather lanes
 * 0 and 2 into the low half, narrow words to bytes, and store the low dword.
 */
static FORCE_INLINE void pack_chroma_1plane(uint8_t *uv_plane,
		uint32_t chroma_pos,
		const __m128i line1, const __m128i line2,
		const __m128i uv_mask)
{
	const __m128i top    = _mm_and_si128(line1, uv_mask);
	const __m128i bottom = _mm_and_si128(line2, uv_mask);

	/* vertical sum of the two rows */
	const __m128i col_sum = _mm_add_epi64(top, bottom);

	/* swap 32-bit lanes 0<->1 and 2<->3 so neighbours can be added */
	const __m128i neighbour =
		_mm_shuffle_epi32(col_sum, _MM_SHUFFLE(2, 3, 0, 1));

	__m128i avg = _mm_add_epi64(col_sum, neighbour);

	/* sum of four samples -> average */
	avg = _mm_srai_epi16(avg, 2);

	/* bring lanes 0 and 2 (one per 2x2 group) next to each other */
	avg = _mm_shuffle_epi32(avg, _MM_SHUFFLE(3, 1, 2, 0));
	avg = _mm_packus_epi16(avg, avg);

	*(uint32_t*)(uv_plane + chroma_pos) = get_m128_32_0(avg);
}
|
|
|
|
|
2014-02-09 06:59:00 -08:00
|
|
|
/*
 * Same 2x2 averaging as the single-plane variant, but the two averaged
 * components are de-interleaved: after the word shuffle the low dword holds
 * [a0 a1 b0 b1] as bytes, and the low 16 bits are written to
 * u_plane + chroma_pos while the high 16 bits are written to
 * v_plane + chroma_pos (two bytes per plane, via uint16_t stores).
 */
static FORCE_INLINE void pack_chroma_2plane(uint8_t *u_plane, uint8_t *v_plane,
		uint32_t chroma_pos,
		const __m128i line1, const __m128i line2,
		const __m128i uv_mask)
{
	const __m128i top    = _mm_and_si128(line1, uv_mask);
	const __m128i bottom = _mm_and_si128(line2, uv_mask);

	/* vertical sum of the two rows */
	const __m128i col_sum = _mm_add_epi64(top, bottom);

	/* swap 32-bit lanes 0<->1 and 2<->3 so neighbours can be added */
	const __m128i neighbour =
		_mm_shuffle_epi32(col_sum, _MM_SHUFFLE(2, 3, 0, 1));

	__m128i avg = _mm_add_epi64(col_sum, neighbour);
	uint32_t both_planes;

	/* sum of four samples -> average */
	avg = _mm_srai_epi16(avg, 2);

	/* gather lanes 0 and 2 into the low 64 bits ... */
	avg = _mm_shuffle_epi32(avg, _MM_SHUFFLE(3, 1, 2, 0));
	/* ... then reorder the low words from a0 b0 a1 b1 to a0 a1 b0 b1 */
	avg = _mm_shufflelo_epi16(avg, _MM_SHUFFLE(3, 1, 2, 0));
	avg = _mm_packus_epi16(avg, avg);

	both_planes = get_m128_32_0(avg);

	*(uint16_t*)(u_plane + chroma_pos) = (uint16_t)(both_planes);
	*(uint16_t*)(v_plane + chroma_pos) = (uint16_t)(both_planes >> 16);
}
|
|
|
|
|
2014-02-07 02:03:54 -08:00
|
|
|
/*
 * Converts packed 4-byte-per-pixel input into three planes: output[0]
 * receives the luma bytes, output[1] and output[2] receive the two
 * 2x2-averaged chroma components.
 *
 * Rows are processed in pairs from start_y up to (not including) end_y, and
 * each inner step consumes four pixels (16 bytes) from row y and row y+1.
 * The 16-byte loads use _mm_load_si128, which requires 16-byte-aligned
 * addresses.  out_linesize[0] is used as the luma stride and
 * out_linesize[1] as the stride for both chroma planes.
 *
 * The height parameter is not used by this function.
 */
void compress_uyvx_to_i420(
		const uint8_t *input, uint32_t in_linesize,
		uint32_t width, uint32_t height,
		uint32_t start_y, uint32_t end_y,
		uint8_t *output[], const uint32_t out_linesize[])
{
	uint8_t *const dst_y = output[0];
	uint8_t *const dst_u = output[1];
	uint8_t *const dst_v = output[2];

	/* byte 1 of every pixel is luma; byte 0 of every word is chroma */
	const __m128i lum_mask = _mm_set1_epi32(0x0000FF00);
	const __m128i uv_mask  = _mm_set1_epi16(0x00FF);

	uint32_t row;

	for (row = start_y; row < end_y; row += 2) {
		const uint32_t src_row_off    = row * in_linesize;
		const uint32_t chroma_row_off = (row >> 1) * out_linesize[1];
		const uint32_t lum_row_off    = row * out_linesize[0];
		uint32_t col;

		for (col = 0; col < width; col += 4) {
			const uint8_t *src = input + src_row_off + col*4;
			const uint32_t lum_top    = lum_row_off + col;
			const uint32_t lum_bottom = lum_top + out_linesize[0];

			const __m128i px_top =
				_mm_load_si128((const __m128i*)src);
			const __m128i px_bottom = _mm_load_si128(
					(const __m128i*)(src + in_linesize));

			pack_lum(dst_y, lum_top, lum_bottom,
					px_top, px_bottom, lum_mask);
			pack_chroma_2plane(dst_u, dst_v,
					chroma_row_off + (col >> 1),
					px_top, px_bottom, uv_mask);
		}
	}
}
|
|
|
|
|
2014-02-07 02:03:54 -08:00
|
|
|
/*
 * Converts packed 4-byte-per-pixel input into two planes: output[0]
 * receives the luma bytes and output[1] receives the two 2x2-averaged
 * chroma components interleaved.
 *
 * Rows are processed in pairs from start_y up to (not including) end_y, and
 * each inner step consumes four pixels (16 bytes) from row y and row y+1.
 * The 16-byte loads use _mm_load_si128, which requires 16-byte-aligned
 * addresses.  out_linesize[0] is the luma stride, out_linesize[1] the
 * chroma stride.
 *
 * The height parameter is not used by this function.
 */
void compress_uyvx_to_nv12(
		const uint8_t *input, uint32_t in_linesize,
		uint32_t width, uint32_t height,
		uint32_t start_y, uint32_t end_y,
		uint8_t *output[], const uint32_t out_linesize[])
{
	uint8_t *const dst_y  = output[0];
	uint8_t *const dst_uv = output[1];

	/* byte 1 of every pixel is luma; byte 0 of every word is chroma */
	const __m128i lum_mask = _mm_set1_epi32(0x0000FF00);
	const __m128i uv_mask  = _mm_set1_epi16(0x00FF);

	uint32_t row;

	for (row = start_y; row < end_y; row += 2) {
		const uint32_t src_row_off    = row * in_linesize;
		const uint32_t chroma_row_off = (row >> 1) * out_linesize[1];
		const uint32_t lum_row_off    = row * out_linesize[0];
		uint32_t col;

		for (col = 0; col < width; col += 4) {
			const uint8_t *src = input + src_row_off + col*4;
			const uint32_t lum_top    = lum_row_off + col;
			const uint32_t lum_bottom = lum_top + out_linesize[0];

			const __m128i px_top =
				_mm_load_si128((const __m128i*)src);
			const __m128i px_bottom = _mm_load_si128(
					(const __m128i*)(src + in_linesize));

			pack_lum(dst_y, lum_top, lum_bottom,
					px_top, px_bottom, lum_mask);
			pack_chroma_1plane(dst_uv, chroma_row_off + col,
					px_top, px_bottom, uv_mask);
		}
	}
}
|
|
|
|
|
2014-02-07 02:03:54 -08:00
|
|
|
/*
 * Expands three-plane 4:2:0 input into packed 32-bit pixels.
 *
 * input[0] is the luma plane, input[1] and input[2] are the two chroma
 * planes; in_linesize[] gives the byte stride of each plane.  Every output
 * pixel is written as one uint32_t:
 *     luma | (input[1] sample << 8) | (input[2] sample << 16)
 * with one chroma pair shared by a 2x2 block of luma samples.
 *
 * Rows start_y/2 .. end_y/2 of the chroma planes are processed, each
 * producing two output rows.  The height parameter is not used.
 *
 * Fix: luma rows are now addressed with in_linesize[0] and output rows with
 * out_linesize.  Previously the luma stride was assumed to equal width and
 * the output stride was taken from in_linesize[0] (out_linesize was
 * ignored), which addressed the wrong rows for padded input or for any
 * output stride different from in_linesize[0].
 *
 * The output is written through uint32_t pointers, so output and
 * out_linesize are expected to keep every row 4-byte aligned.
 */
void decompress_420(
		const uint8_t *const input[], const uint32_t in_linesize[],
		uint32_t width, uint32_t height,
		uint32_t start_y, uint32_t end_y,
		uint8_t *output, uint32_t out_linesize)
{
	uint32_t start_y_d2 = start_y/2;
	uint32_t width_d2   = width/2;
	uint32_t height_d2  = end_y/2;
	uint32_t y;

	for (y = start_y_d2; y < height_d2; y++) {
		const uint8_t *chroma0 = input[1] + y * in_linesize[1];
		const uint8_t *chroma1 = input[2] + y * in_linesize[2];
		register const uint8_t *lum0, *lum1;
		register uint32_t *output0, *output1;
		uint32_t x;

		/* two luma rows and two output rows per chroma row */
		lum0 = input[0] + y*2 * in_linesize[0];
		lum1 = lum0 + in_linesize[0];
		output0 = (uint32_t*)(output + y*2 * out_linesize);
		output1 = (uint32_t*)((uint8_t*)output0 + out_linesize);

		for (x = 0; x < width_d2; x++) {
			uint32_t out;
			out = (*(chroma0++) << 8) | (*(chroma1++) << 16);

			*(output0++) = *(lum0++) | out;
			*(output0++) = *(lum0++) | out;

			*(output1++) = *(lum1++) | out;
			*(output1++) = *(lum1++) | out;
		}
	}
}
|
|
|
|
|
2014-02-07 02:03:54 -08:00
|
|
|
/*
 * Expands two-plane 4:2:0 input (luma plane plus one interleaved chroma
 * plane) into packed 32-bit pixels.
 *
 * input[0] is the luma plane and input[1] the interleaved chroma plane;
 * in_linesize[] gives the byte stride of each.  Every output pixel is
 * written as one uint32_t:
 *     luma | (first chroma byte << 8) | (second chroma byte << 16)
 * with one chroma pair shared by a 2x2 block of luma samples.
 *
 * Rows start_y/2 .. end_y/2 of the chroma plane are processed, each
 * producing two output rows.  The height parameter is not used.
 *
 * The chroma pair is assembled from two individual byte reads rather than
 * by dereferencing the byte plane through a uint16_t pointer.  That keeps
 * the read valid for any chroma pointer/stride alignment and makes the
 * result independent of host byte order; on little-endian hosts the value
 * is identical to the 16-bit load it replaces.
 *
 * The output is written through uint32_t pointers, so output and
 * out_linesize are expected to keep every row 4-byte aligned.
 */
void decompress_nv12(
		const uint8_t *const input[], const uint32_t in_linesize[],
		uint32_t width, uint32_t height,
		uint32_t start_y, uint32_t end_y,
		uint8_t *output, uint32_t out_linesize)
{
	uint32_t start_y_d2 = start_y/2;
	uint32_t width_d2   = width/2;
	uint32_t height_d2  = end_y/2;
	uint32_t y;

	for (y = start_y_d2; y < height_d2; y++) {
		const uint8_t *chroma;
		register const uint8_t *lum0, *lum1;
		register uint32_t *output0, *output1;
		uint32_t x;

		chroma = input[1] + y * in_linesize[1];
		lum0 = input[0] + y*2 * in_linesize[0];
		lum1 = lum0 + in_linesize[0];
		output0 = (uint32_t*)(output + y*2 * out_linesize);
		output1 = (uint32_t*)((uint8_t*)output0 + out_linesize);

		for (x = 0; x < width_d2; x++) {
			uint32_t out = ((uint32_t)chroma[0] << 8) |
			               ((uint32_t)chroma[1] << 16);
			chroma += 2;

			*(output0++) = *(lum0++) | out;
			*(output0++) = *(lum0++) | out;

			*(output1++) = *(lum1++) | out;
			*(output1++) = *(lum1++) | out;
		}
	}
}
|
|
|
|
|
2014-02-07 02:03:54 -08:00
|
|
|
/*
 * Expands packed 4:2:2 input (two pixels per 32-bit word) into one 32-bit
 * word per pixel.
 *
 * For every input word two output words are written:
 *   - the first is the input word unchanged;
 *   - the second is the input word with one byte replaced by the byte that
 *     sits 16 bits above it:
 *       leading_lum == true  : byte 0 is replaced by byte 2
 *       leading_lum == false : byte 1 is replaced by byte 3
 *
 * Rows start_y .. end_y-1 are processed; row y is read at
 * input + y*in_linesize and written at output + y*out_linesize, and
 * width/2 input words are consumed per row.  Both buffers are accessed
 * through uint32_t pointers.  The height parameter is not used.
 */
void decompress_422(
		const uint8_t *input, uint32_t in_linesize,
		uint32_t width, uint32_t height,
		uint32_t start_y, uint32_t end_y,
		uint8_t *output, uint32_t out_linesize,
		bool leading_lum)
{
	/* byte position that holds the first pixel's luma in each word */
	const uint32_t lum_slot = leading_lum ? 0x000000FFu : 0x0000FF00u;
	const uint32_t pairs    = width >> 1;
	uint32_t row;

	for (row = start_y; row < end_y; row++) {
		const uint32_t *src =
			(const uint32_t*)(input + row*in_linesize);
		uint32_t *dst = (uint32_t*)(output + row*out_linesize);
		uint32_t i;

		for (i = 0; i < pairs; i++) {
			const uint32_t word = src[i];

			/* first pixel: word as-is */
			dst[2*i] = word;
			/* second pixel: move its luma into the first slot */
			dst[2*i + 1] = (word & ~lum_slot) |
			               ((word >> 16) & lum_slot);
		}
	}
}
|