1375 lines
42 KiB
C++
1375 lines
42 KiB
C++
/*
|
|
* Copyright 2013 The LibYuv Project Authors. All rights reserved.
|
|
*
|
|
* Use of this source code is governed by a BSD-style license
|
|
* that can be found in the LICENSE file in the root of the source
|
|
* tree. An additional intellectual property rights grant can be found
|
|
* in the file PATENTS. All contributing project authors may
|
|
* be found in the AUTHORS file in the root of the source tree.
|
|
*/
|
|
|
|
#include "libyuv/row.h"
|
|
#include "libyuv/scale_row.h"
|
|
|
|
#ifdef __cplusplus
|
|
namespace libyuv {
|
|
extern "C" {
|
|
#endif
|
|
|
|
// This module is for 32 bit Visual C x86 and clangcl
|
|
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
|
|
|
|
// Offsets for source bytes 0 to 9
|
|
static uvec8 kShuf0 =
|
|
{ 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 };
|
|
|
|
// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
|
|
static uvec8 kShuf1 =
|
|
{ 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 };
|
|
|
|
// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31.
|
|
static uvec8 kShuf2 =
|
|
{ 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 };
|
|
|
|
// Offsets for source bytes 0 to 10
|
|
static uvec8 kShuf01 =
|
|
{ 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 };
|
|
|
|
// Offsets for source bytes 10 to 21 with 8 subtracted = 3 to 13.
|
|
static uvec8 kShuf11 =
|
|
{ 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 };
|
|
|
|
// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31.
|
|
static uvec8 kShuf21 =
|
|
{ 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 };
|
|
|
|
// Coefficients for source bytes 0 to 10
|
|
static uvec8 kMadd01 =
|
|
{ 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 };
|
|
|
|
// Coefficients for source bytes 10 to 21
|
|
static uvec8 kMadd11 =
|
|
{ 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 };
|
|
|
|
// Coefficients for source bytes 21 to 31
|
|
static uvec8 kMadd21 =
|
|
{ 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 };
|
|
|
|
// Coefficients for source bytes 21 to 31
|
|
static vec16 kRound34 =
|
|
{ 2, 2, 2, 2, 2, 2, 2, 2 };
|
|
|
|
static uvec8 kShuf38a =
|
|
{ 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
|
|
|
|
static uvec8 kShuf38b =
|
|
{ 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 };
|
|
|
|
// Arrange words 0,3,6 into 0,1,2
|
|
static uvec8 kShufAc =
|
|
{ 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
|
|
|
|
// Arrange words 0,3,6 into 3,4,5
|
|
static uvec8 kShufAc3 =
|
|
{ 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 };
|
|
|
|
// Scaling values for boxes of 3x3 and 2x3
|
|
static uvec16 kScaleAc33 =
|
|
{ 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 };
|
|
|
|
// Arrange first value for pixels 0,1,2,3,4,5
|
|
static uvec8 kShufAb0 =
|
|
{ 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 };
|
|
|
|
// Arrange second value for pixels 0,1,2,3,4,5
|
|
static uvec8 kShufAb1 =
|
|
{ 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 };
|
|
|
|
// Arrange third value for pixels 0,1,2,3,4,5
|
|
static uvec8 kShufAb2 =
|
|
{ 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 };
|
|
|
|
// Scaling values for boxes of 3x2 and 2x2
|
|
static uvec16 kScaleAb2 =
|
|
{ 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 };
|
|
|
|
// Reads 32 pixels, throws half away and writes 16 pixels.
// The odd-indexed byte of every source pair is kept (psrlw 8 + packuswb).
// dst_width is reduced by 16 per pass and the loop repeats while the
// remainder is > 0, so one full 32 byte read / 16 byte write always happens.
|
|
__declspec(naked)
|
|
void ScaleRowDown2_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
|
|
uint8* dst_ptr, int dst_width) {
|
|
__asm {
|
|
mov eax, [esp + 4] // src_ptr
|
|
// src_stride ignored
|
|
mov edx, [esp + 12] // dst_ptr
|
|
mov ecx, [esp + 16] // dst_width
|
|
|
|
wloop:
|
|
movdqu xmm0, [eax]
|
|
movdqu xmm1, [eax + 16]
|
|
lea eax, [eax + 32]
|
|
psrlw xmm0, 8 // isolate odd pixels.
|
|
psrlw xmm1, 8
|
|
packuswb xmm0, xmm1
|
|
movdqu [edx], xmm0
|
|
lea edx, [edx + 16]
|
|
sub ecx, 16
|
|
jg wloop
|
|
|
|
ret
|
|
}
|
|
}
|
|
|
|
// Blends 32x1 rectangle to 16x1.
// Each output byte is (a + b + 1) / 2 of two horizontally adjacent source
// bytes: pmaddubsw by 1,1 sums each pair, pavgw against zero halves with
// rounding. One pass consumes 32 source bytes and stores 16.
|
|
__declspec(naked)
|
|
void ScaleRowDown2Linear_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
|
|
uint8* dst_ptr, int dst_width) {
|
|
__asm {
|
|
mov eax, [esp + 4] // src_ptr
|
|
// src_stride is not read by this function
|
|
mov edx, [esp + 12] // dst_ptr
|
|
mov ecx, [esp + 16] // dst_width
|
|
|
|
pcmpeqb xmm4, xmm4 // constant 0x0101
|
|
psrlw xmm4, 15
|
|
packuswb xmm4, xmm4
|
|
pxor xmm5, xmm5 // constant 0
|
|
|
|
wloop:
|
|
movdqu xmm0, [eax]
|
|
movdqu xmm1, [eax + 16]
|
|
lea eax, [eax + 32]
|
|
pmaddubsw xmm0, xmm4 // horizontal add
|
|
pmaddubsw xmm1, xmm4
|
|
pavgw xmm0, xmm5 // (x + 1) / 2
|
|
pavgw xmm1, xmm5
|
|
packuswb xmm0, xmm1
|
|
movdqu [edx], xmm0
|
|
lea edx, [edx + 16]
|
|
sub ecx, 16
|
|
jg wloop
|
|
|
|
ret
|
|
}
|
|
}
|
|
|
|
// Blends 32x2 rectangle to 16x1.
// Each output byte is the rounded average of a 2x2 box taken from the row at
// src_ptr and the row at src_ptr + src_stride: (sum + 2) / 4, computed as
// ((sum >> 1) + 1) >> 1. One pass consumes 32 bytes per row and stores 16.
|
|
__declspec(naked)
|
|
void ScaleRowDown2Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
|
|
uint8* dst_ptr, int dst_width) {
|
|
__asm {
|
|
push esi
|
|
// Argument offsets below include the 4 bytes pushed for esi.
mov eax, [esp + 4 + 4] // src_ptr
|
|
mov esi, [esp + 4 + 8] // src_stride
|
|
mov edx, [esp + 4 + 12] // dst_ptr
|
|
mov ecx, [esp + 4 + 16] // dst_width
|
|
|
|
pcmpeqb xmm4, xmm4 // constant 0x0101
|
|
psrlw xmm4, 15
|
|
packuswb xmm4, xmm4
|
|
pxor xmm5, xmm5 // constant 0
|
|
|
|
wloop:
|
|
movdqu xmm0, [eax]
|
|
movdqu xmm1, [eax + 16]
|
|
movdqu xmm2, [eax + esi]
|
|
movdqu xmm3, [eax + esi + 16]
|
|
lea eax, [eax + 32]
|
|
pmaddubsw xmm0, xmm4 // horizontal add
|
|
pmaddubsw xmm1, xmm4
|
|
pmaddubsw xmm2, xmm4
|
|
pmaddubsw xmm3, xmm4
|
|
paddw xmm0, xmm2 // vertical add
|
|
paddw xmm1, xmm3
|
|
psrlw xmm0, 1 // sum / 2, then the rounding average below
|
|
psrlw xmm1, 1
|
|
pavgw xmm0, xmm5 // (x + 1) / 2
|
|
pavgw xmm1, xmm5
|
|
packuswb xmm0, xmm1
|
|
movdqu [edx], xmm0
|
|
lea edx, [edx + 16]
|
|
sub ecx, 16
|
|
jg wloop
|
|
|
|
pop esi
|
|
ret
|
|
}
|
|
}
|
|
|
|
#ifdef HAS_SCALEROWDOWN2_AVX2
|
|
// Reads 64 pixels, throws half away and writes 32 pixels.
// Keeps the odd-indexed byte of every source pair. vpackuswb packs within
// each 128 bit lane, so vpermq 0xd8 is used to restore linear byte order.
// The loop always runs once and repeats while dst_width - 32 * passes > 0.
|
|
__declspec(naked)
|
|
void ScaleRowDown2_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
|
|
uint8* dst_ptr, int dst_width) {
|
|
__asm {
|
|
mov eax, [esp + 4] // src_ptr
|
|
// src_stride ignored
|
|
mov edx, [esp + 12] // dst_ptr
|
|
mov ecx, [esp + 16] // dst_width
|
|
|
|
wloop:
|
|
vmovdqu ymm0, [eax]
|
|
vmovdqu ymm1, [eax + 32]
|
|
lea eax, [eax + 64]
|
|
vpsrlw ymm0, ymm0, 8 // isolate odd pixels.
|
|
vpsrlw ymm1, ymm1, 8
|
|
vpackuswb ymm0, ymm0, ymm1
|
|
vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb
|
|
vmovdqu [edx], ymm0
|
|
lea edx, [edx + 32]
|
|
sub ecx, 32
|
|
jg wloop
|
|
|
|
vzeroupper
|
|
ret
|
|
}
|
|
}
|
|
|
|
// Blends 64x1 rectangle to 32x1.
// Each output byte is (a + b + 1) / 2 of two horizontally adjacent source
// bytes. One pass consumes 64 source bytes and stores 32.
|
|
__declspec(naked)
|
|
void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
|
|
uint8* dst_ptr, int dst_width) {
|
|
__asm {
|
|
mov eax, [esp + 4] // src_ptr
|
|
// src_stride is not read by this function
|
|
mov edx, [esp + 12] // dst_ptr
|
|
mov ecx, [esp + 16] // dst_width
|
|
|
|
vpcmpeqb ymm4, ymm4, ymm4 // '1' constant, 8b
|
|
vpsrlw ymm4, ymm4, 15
|
|
vpackuswb ymm4, ymm4, ymm4
|
|
vpxor ymm5, ymm5, ymm5 // constant 0
|
|
|
|
wloop:
|
|
vmovdqu ymm0, [eax]
|
|
vmovdqu ymm1, [eax + 32]
|
|
lea eax, [eax + 64]
|
|
vpmaddubsw ymm0, ymm0, ymm4 // horizontal add
|
|
vpmaddubsw ymm1, ymm1, ymm4
|
|
vpavgw ymm0, ymm0, ymm5 // (x + 1) / 2
|
|
vpavgw ymm1, ymm1, ymm5
|
|
vpackuswb ymm0, ymm0, ymm1
|
|
vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb
|
|
vmovdqu [edx], ymm0
|
|
lea edx, [edx + 32]
|
|
sub ecx, 32
|
|
jg wloop
|
|
|
|
vzeroupper
|
|
ret
|
|
}
|
|
}
|
|
|
|
// For rounding, average = (sum + 2) / 4
|
|
// becomes average((sum >> 1), 0)
|
|
// Blends 64x2 rectangle to 32x1.
// The second row is read from src_ptr + src_stride. One pass consumes 64
// bytes from each row and stores 32 bytes.
|
|
__declspec(naked)
|
|
void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
|
|
uint8* dst_ptr, int dst_width) {
|
|
__asm {
|
|
push esi
|
|
// Argument offsets below include the 4 bytes pushed for esi.
mov eax, [esp + 4 + 4] // src_ptr
|
|
mov esi, [esp + 4 + 8] // src_stride
|
|
mov edx, [esp + 4 + 12] // dst_ptr
|
|
mov ecx, [esp + 4 + 16] // dst_width
|
|
|
|
vpcmpeqb ymm4, ymm4, ymm4 // '1' constant, 8b
|
|
vpsrlw ymm4, ymm4, 15
|
|
vpackuswb ymm4, ymm4, ymm4
|
|
vpxor ymm5, ymm5, ymm5 // constant 0
|
|
|
|
wloop:
|
|
vmovdqu ymm0, [eax]
|
|
vmovdqu ymm1, [eax + 32]
|
|
vmovdqu ymm2, [eax + esi]
|
|
vmovdqu ymm3, [eax + esi + 32]
|
|
lea eax, [eax + 64]
|
|
vpmaddubsw ymm0, ymm0, ymm4 // horizontal add
|
|
vpmaddubsw ymm1, ymm1, ymm4
|
|
vpmaddubsw ymm2, ymm2, ymm4
|
|
vpmaddubsw ymm3, ymm3, ymm4
|
|
vpaddw ymm0, ymm0, ymm2 // vertical add
|
|
vpaddw ymm1, ymm1, ymm3
|
|
vpsrlw ymm0, ymm0, 1 // (x + 2) / 4 = (x / 2 + 1) / 2
|
|
vpsrlw ymm1, ymm1, 1
|
|
vpavgw ymm0, ymm0, ymm5 // (x + 1) / 2
|
|
vpavgw ymm1, ymm1, ymm5
|
|
vpackuswb ymm0, ymm0, ymm1
|
|
vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb
|
|
vmovdqu [edx], ymm0
|
|
lea edx, [edx + 32]
|
|
sub ecx, 32
|
|
jg wloop
|
|
|
|
pop esi
|
|
vzeroupper
|
|
ret
|
|
}
|
|
}
|
|
#endif // HAS_SCALEROWDOWN2_AVX2
|
|
|
|
// Point samples 32 pixels to 8 pixels.
// The 0x00ff0000 mask keeps byte 2 of every group of 4 source bytes; two
// pack steps then squeeze the 8 surviving bytes together. One pass consumes
// 32 source bytes and stores 8.
|
|
__declspec(naked)
|
|
void ScaleRowDown4_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
|
|
uint8* dst_ptr, int dst_width) {
|
|
__asm {
|
|
mov eax, [esp + 4] // src_ptr
|
|
// src_stride ignored
|
|
mov edx, [esp + 12] // dst_ptr
|
|
mov ecx, [esp + 16] // dst_width
|
|
pcmpeqb xmm5, xmm5 // generate mask 0x00ff0000
|
|
psrld xmm5, 24
|
|
pslld xmm5, 16
|
|
|
|
wloop:
|
|
movdqu xmm0, [eax]
|
|
movdqu xmm1, [eax + 16]
|
|
lea eax, [eax + 32]
|
|
pand xmm0, xmm5
|
|
pand xmm1, xmm5
|
|
packuswb xmm0, xmm1 // bytes are now 0, p, 0, p, ...
|
|
psrlw xmm0, 8 // move each kept byte to the low half of its word
|
|
packuswb xmm0, xmm0
|
|
movq qword ptr [edx], xmm0
|
|
lea edx, [edx + 8]
|
|
sub ecx, 8
|
|
jg wloop
|
|
|
|
ret
|
|
}
|
|
}
|
|
|
|
// Blends 32x4 rectangle to 8x1.
// Each output byte is (sum of a 4x4 box + 8) / 16. Rows are read at
// src_ptr + 0, 1, 2 and 3 times src_stride. One pass consumes 32 bytes from
// each of the 4 rows and stores 8 bytes.
|
|
__declspec(naked)
|
|
void ScaleRowDown4Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
|
|
uint8* dst_ptr, int dst_width) {
|
|
__asm {
|
|
push esi
|
|
push edi
|
|
// Argument offsets below include the 8 bytes pushed for esi and edi.
mov eax, [esp + 8 + 4] // src_ptr
|
|
mov esi, [esp + 8 + 8] // src_stride
|
|
mov edx, [esp + 8 + 12] // dst_ptr
|
|
mov ecx, [esp + 8 + 16] // dst_width
|
|
lea edi, [esi + esi * 2] // src_stride * 3
|
|
pcmpeqb xmm4, xmm4 // constant 0x0101
|
|
psrlw xmm4, 15
|
|
movdqa xmm5, xmm4
|
|
packuswb xmm4, xmm4
|
|
psllw xmm5, 3 // constant 0x0008
|
|
|
|
wloop:
|
|
movdqu xmm0, [eax] // average rows
|
|
movdqu xmm1, [eax + 16]
|
|
movdqu xmm2, [eax + esi]
|
|
movdqu xmm3, [eax + esi + 16]
|
|
pmaddubsw xmm0, xmm4 // horizontal add
|
|
pmaddubsw xmm1, xmm4
|
|
pmaddubsw xmm2, xmm4
|
|
pmaddubsw xmm3, xmm4
|
|
paddw xmm0, xmm2 // vertical add rows 0, 1
|
|
paddw xmm1, xmm3
|
|
movdqu xmm2, [eax + esi * 2]
|
|
movdqu xmm3, [eax + esi * 2 + 16]
|
|
pmaddubsw xmm2, xmm4
|
|
pmaddubsw xmm3, xmm4
|
|
paddw xmm0, xmm2 // add row 2
|
|
paddw xmm1, xmm3
|
|
movdqu xmm2, [eax + edi]
|
|
movdqu xmm3, [eax + edi + 16]
|
|
lea eax, [eax + 32]
|
|
pmaddubsw xmm2, xmm4
|
|
pmaddubsw xmm3, xmm4
|
|
paddw xmm0, xmm2 // add row 3
|
|
paddw xmm1, xmm3
|
|
phaddw xmm0, xmm1 // add adjacent 2-wide sums -> 4-wide sums
|
|
paddw xmm0, xmm5 // + 8 for round
|
|
psrlw xmm0, 4 // /16 for average of 4 * 4
|
|
packuswb xmm0, xmm0
|
|
movq qword ptr [edx], xmm0
|
|
lea edx, [edx + 8]
|
|
sub ecx, 8
|
|
jg wloop
|
|
|
|
pop edi
|
|
pop esi
|
|
ret
|
|
}
|
|
}
|
|
|
|
#ifdef HAS_SCALEROWDOWN4_AVX2
|
|
// Point samples 64 pixels to 16 pixels.
// The 0x00ff0000 mask keeps byte 2 of every group of 4 source bytes. Each
// vpackuswb works per 128 bit lane, so it is followed by vpermq 0xd8 to put
// the result back in linear order. One pass stores the low 16 bytes only.
|
|
__declspec(naked)
|
|
void ScaleRowDown4_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
|
|
uint8* dst_ptr, int dst_width) {
|
|
__asm {
|
|
mov eax, [esp + 4] // src_ptr
|
|
// src_stride ignored
|
|
mov edx, [esp + 12] // dst_ptr
|
|
mov ecx, [esp + 16] // dst_width
|
|
vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff0000
|
|
vpsrld ymm5, ymm5, 24
|
|
vpslld ymm5, ymm5, 16
|
|
|
|
wloop:
|
|
vmovdqu ymm0, [eax]
|
|
vmovdqu ymm1, [eax + 32]
|
|
lea eax, [eax + 64]
|
|
vpand ymm0, ymm0, ymm5
|
|
vpand ymm1, ymm1, ymm5
|
|
vpackuswb ymm0, ymm0, ymm1
|
|
vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb
|
|
vpsrlw ymm0, ymm0, 8
|
|
vpackuswb ymm0, ymm0, ymm0
|
|
vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb
|
|
vmovdqu [edx], xmm0
|
|
lea edx, [edx + 16]
|
|
sub ecx, 16
|
|
jg wloop
|
|
|
|
vzeroupper
|
|
ret
|
|
}
|
|
}
|
|
|
|
// Blends 64x4 rectangle to 16x1.
// Each output byte is (sum of a 4x4 box + 8) / 16. Rows are read at
// src_ptr + 0, 1, 2 and 3 times src_stride. One pass consumes 64 bytes from
// each of the 4 rows and stores 16 bytes.
|
|
__declspec(naked)
|
|
void ScaleRowDown4Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
|
|
uint8* dst_ptr, int dst_width) {
|
|
__asm {
|
|
push esi
|
|
push edi
|
|
// Argument offsets below include the 8 bytes pushed for esi and edi.
mov eax, [esp + 8 + 4] // src_ptr
|
|
mov esi, [esp + 8 + 8] // src_stride
|
|
mov edx, [esp + 8 + 12] // dst_ptr
|
|
mov ecx, [esp + 8 + 16] // dst_width
|
|
lea edi, [esi + esi * 2] // src_stride * 3
|
|
vpcmpeqb ymm4, ymm4, ymm4 // constant 0x0101
|
|
vpsrlw ymm4, ymm4, 15
|
|
vpsllw ymm5, ymm4, 3 // constant 0x0008
|
|
vpackuswb ymm4, ymm4, ymm4
|
|
|
|
wloop:
|
|
vmovdqu ymm0, [eax] // average rows
|
|
vmovdqu ymm1, [eax + 32]
|
|
vmovdqu ymm2, [eax + esi]
|
|
vmovdqu ymm3, [eax + esi + 32]
|
|
vpmaddubsw ymm0, ymm0, ymm4 // horizontal add
|
|
vpmaddubsw ymm1, ymm1, ymm4
|
|
vpmaddubsw ymm2, ymm2, ymm4
|
|
vpmaddubsw ymm3, ymm3, ymm4
|
|
vpaddw ymm0, ymm0, ymm2 // vertical add rows 0, 1
|
|
vpaddw ymm1, ymm1, ymm3
|
|
vmovdqu ymm2, [eax + esi * 2]
|
|
vmovdqu ymm3, [eax + esi * 2 + 32]
|
|
vpmaddubsw ymm2, ymm2, ymm4
|
|
vpmaddubsw ymm3, ymm3, ymm4
|
|
vpaddw ymm0, ymm0, ymm2 // add row 2
|
|
vpaddw ymm1, ymm1, ymm3
|
|
vmovdqu ymm2, [eax + edi]
|
|
vmovdqu ymm3, [eax + edi + 32]
|
|
lea eax, [eax + 64]
|
|
vpmaddubsw ymm2, ymm2, ymm4
|
|
vpmaddubsw ymm3, ymm3, ymm4
|
|
vpaddw ymm0, ymm0, ymm2 // add row 3
|
|
vpaddw ymm1, ymm1, ymm3
|
|
vphaddw ymm0, ymm0, ymm1 // mutates
|
|
vpermq ymm0, ymm0, 0xd8 // unmutate vphaddw
|
|
vpaddw ymm0, ymm0, ymm5 // + 8 for round
|
|
vpsrlw ymm0, ymm0, 4 // /16 for average of 4 * 4
|
|
vpackuswb ymm0, ymm0, ymm0
|
|
vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb
|
|
vmovdqu [edx], xmm0
|
|
lea edx, [edx + 16]
|
|
sub ecx, 16
|
|
jg wloop
|
|
|
|
pop edi
|
|
pop esi
|
|
vzeroupper
|
|
ret
|
|
}
|
|
}
|
|
#endif // HAS_SCALEROWDOWN4_AVX2
|
|
|
|
// Point samples 32 pixels to 24 pixels.
|
|
// Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
|
|
// Then shuffled to do the scaling.
// The middle 8 output bytes come from source bytes 8..23, formed with
// palignr from the two 16 byte loads. The loop always runs once and repeats
// while dst_width - 24 * passes > 0.
|
|
|
|
__declspec(naked)
|
|
void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
|
|
uint8* dst_ptr, int dst_width) {
|
|
__asm {
|
|
mov eax, [esp + 4] // src_ptr
|
|
// src_stride ignored
|
|
mov edx, [esp + 12] // dst_ptr
|
|
mov ecx, [esp + 16] // dst_width
|
|
movdqa xmm3, xmmword ptr kShuf0
|
|
movdqa xmm4, xmmword ptr kShuf1
|
|
movdqa xmm5, xmmword ptr kShuf2
|
|
|
|
wloop:
|
|
movdqu xmm0, [eax]
|
|
movdqu xmm1, [eax + 16]
|
|
lea eax, [eax + 32]
|
|
movdqa xmm2, xmm1
|
|
palignr xmm1, xmm0, 8 // xmm1 = source bytes 8..23
|
|
pshufb xmm0, xmm3
|
|
pshufb xmm1, xmm4
|
|
pshufb xmm2, xmm5
|
|
movq qword ptr [edx], xmm0
|
|
movq qword ptr [edx + 8], xmm1
|
|
movq qword ptr [edx + 16], xmm2
|
|
lea edx, [edx + 24]
|
|
sub ecx, 24
|
|
jg wloop
|
|
|
|
ret
|
|
}
|
|
}
|
|
|
|
// Blends 32x2 rectangle to 24x1
|
|
// Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
|
|
// Then shuffled to do the scaling.
// The two rows (src_ptr and src_ptr + src_stride) are first averaged with
// equal weight by pavgb. Horizontally each output is a weighted sum of two
// neighbouring bytes (weights from kMadd*, totalling 4), plus kRound34,
// shifted right by 2.
|
|
|
|
// Register usage:
|
|
// xmm0 src_row 0
|
|
// xmm1 src_row 1
|
|
// xmm2 shuf 0
|
|
// xmm3 shuf 1
|
|
// xmm4 shuf 2
|
|
// xmm5 madd 0
|
|
// xmm6 madd 1
|
|
// xmm7 kRound34
|
|
|
|
// Note that movdqa+palign may be better than movdqu.
|
|
__declspec(naked)
|
|
void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
|
|
ptrdiff_t src_stride,
|
|
uint8* dst_ptr, int dst_width) {
|
|
__asm {
|
|
push esi
|
|
mov eax, [esp + 4 + 4] // src_ptr
|
|
mov esi, [esp + 4 + 8] // src_stride
|
|
mov edx, [esp + 4 + 12] // dst_ptr
|
|
mov ecx, [esp + 4 + 16] // dst_width
|
|
movdqa xmm2, xmmword ptr kShuf01
|
|
movdqa xmm3, xmmword ptr kShuf11
|
|
movdqa xmm4, xmmword ptr kShuf21
|
|
movdqa xmm5, xmmword ptr kMadd01
|
|
movdqa xmm6, xmmword ptr kMadd11
|
|
movdqa xmm7, xmmword ptr kRound34
|
|
|
|
wloop:
|
|
movdqu xmm0, [eax] // pixels 0..7
|
|
movdqu xmm1, [eax + esi]
|
|
pavgb xmm0, xmm1
|
|
pshufb xmm0, xmm2
|
|
pmaddubsw xmm0, xmm5
|
|
paddsw xmm0, xmm7
|
|
psrlw xmm0, 2
|
|
packuswb xmm0, xmm0
|
|
movq qword ptr [edx], xmm0
|
|
movdqu xmm0, [eax + 8] // pixels 8..15
|
|
movdqu xmm1, [eax + esi + 8]
|
|
pavgb xmm0, xmm1
|
|
pshufb xmm0, xmm3
|
|
pmaddubsw xmm0, xmm6
|
|
paddsw xmm0, xmm7
|
|
psrlw xmm0, 2
|
|
packuswb xmm0, xmm0
|
|
movq qword ptr [edx + 8], xmm0
|
|
movdqu xmm0, [eax + 16] // pixels 16..23
|
|
movdqu xmm1, [eax + esi + 16]
|
|
lea eax, [eax + 32]
|
|
pavgb xmm0, xmm1
|
|
pshufb xmm0, xmm4
|
|
// kMadd21 is reloaded every pass: xmm0..xmm7 are all in use.
movdqa xmm1, xmmword ptr kMadd21
|
|
pmaddubsw xmm0, xmm1
|
|
paddsw xmm0, xmm7
|
|
psrlw xmm0, 2
|
|
packuswb xmm0, xmm0
|
|
movq qword ptr [edx + 16], xmm0
|
|
lea edx, [edx + 24]
|
|
sub ecx, 24
|
|
jg wloop
|
|
|
|
pop esi
|
|
ret
|
|
}
|
|
}
|
|
|
|
// Blends 32x2 rectangle to 24x1, weighting the row at src_ptr more heavily
// than the row at src_ptr + src_stride: the vertical blend is
// avg(row0, avg(row0, row1)) using two pavgb steps. The horizontal filter
// uses the same shuffle, weight and rounding tables as
// ScaleRowDown34_1_Box_SSSE3.
// Note that movdqa+palign may be better than movdqu.
|
|
__declspec(naked)
|
|
void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
|
|
ptrdiff_t src_stride,
|
|
uint8* dst_ptr, int dst_width) {
|
|
__asm {
|
|
push esi
|
|
mov eax, [esp + 4 + 4] // src_ptr
|
|
mov esi, [esp + 4 + 8] // src_stride
|
|
mov edx, [esp + 4 + 12] // dst_ptr
|
|
mov ecx, [esp + 4 + 16] // dst_width
|
|
movdqa xmm2, xmmword ptr kShuf01
|
|
movdqa xmm3, xmmword ptr kShuf11
|
|
movdqa xmm4, xmmword ptr kShuf21
|
|
movdqa xmm5, xmmword ptr kMadd01
|
|
movdqa xmm6, xmmword ptr kMadd11
|
|
movdqa xmm7, xmmword ptr kRound34
|
|
|
|
wloop:
|
|
movdqu xmm0, [eax] // pixels 0..7
|
|
movdqu xmm1, [eax + esi]
|
|
pavgb xmm1, xmm0 // xmm1 = avg(row0, row1)
|
|
pavgb xmm0, xmm1 // xmm0 = avg(row0, avg(row0, row1))
|
|
pshufb xmm0, xmm2
|
|
pmaddubsw xmm0, xmm5
|
|
paddsw xmm0, xmm7
|
|
psrlw xmm0, 2
|
|
packuswb xmm0, xmm0
|
|
movq qword ptr [edx], xmm0
|
|
movdqu xmm0, [eax + 8] // pixels 8..15
|
|
movdqu xmm1, [eax + esi + 8]
|
|
pavgb xmm1, xmm0
|
|
pavgb xmm0, xmm1
|
|
pshufb xmm0, xmm3
|
|
pmaddubsw xmm0, xmm6
|
|
paddsw xmm0, xmm7
|
|
psrlw xmm0, 2
|
|
packuswb xmm0, xmm0
|
|
movq qword ptr [edx + 8], xmm0
|
|
movdqu xmm0, [eax + 16] // pixels 16..23
|
|
movdqu xmm1, [eax + esi + 16]
|
|
lea eax, [eax + 32]
|
|
pavgb xmm1, xmm0
|
|
pavgb xmm0, xmm1
|
|
pshufb xmm0, xmm4
|
|
movdqa xmm1, xmmword ptr kMadd21
|
|
pmaddubsw xmm0, xmm1
|
|
paddsw xmm0, xmm7
|
|
psrlw xmm0, 2
|
|
packuswb xmm0, xmm0
|
|
movq qword ptr [edx + 16], xmm0
|
|
lea edx, [edx+24]
|
|
sub ecx, 24
|
|
jg wloop
|
|
|
|
pop esi
|
|
ret
|
|
}
|
|
}
|
|
|
|
// 3/8 point sampler
|
|
|
|
// Scale 32 pixels to 12
// From each 16 byte load, source bytes 0,3,6,8,11,14 are kept. The two
// shuffled halves occupy disjoint byte positions, so paddusb merges them.
// Each pass stores 12 bytes (8 + 4).
|
|
__declspec(naked)
|
|
void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
|
|
uint8* dst_ptr, int dst_width) {
|
|
__asm {
|
|
mov eax, [esp + 4] // src_ptr
|
|
// src_stride ignored
|
|
mov edx, [esp + 12] // dst_ptr
|
|
mov ecx, [esp + 16] // dst_width
|
|
movdqa xmm4, xmmword ptr kShuf38a
|
|
movdqa xmm5, xmmword ptr kShuf38b
|
|
|
|
xloop:
|
|
movdqu xmm0, [eax] // 16 pixels -> 0,1,2,3,4,5
|
|
movdqu xmm1, [eax + 16] // 16 pixels -> 6,7,8,9,10,11
|
|
lea eax, [eax + 32]
|
|
pshufb xmm0, xmm4
|
|
pshufb xmm1, xmm5
|
|
paddusb xmm0, xmm1
|
|
|
|
movq qword ptr [edx], xmm0 // write 12 pixels
|
|
movhlps xmm1, xmm0
|
|
movd [edx + 8], xmm1
|
|
lea edx, [edx + 12]
|
|
sub ecx, 12
|
|
jg xloop
|
|
|
|
ret
|
|
}
|
|
}
|
|
|
|
// Scale 16x3 pixels to 6x1 with interpolation
// Three rows (src_ptr + 0, 1 and 2 times src_stride) are widened to 16 bit
// and summed. Each group of 8 columns then yields 3 outputs: two sums over 3
// columns and one over 2 columns, scaled by kScaleAc33 (1/9, 1/9, 1/6).
// The final store uses two overlapping 4 byte writes covering dst bytes 0..5.
|
|
__declspec(naked)
|
|
void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
|
|
ptrdiff_t src_stride,
|
|
uint8* dst_ptr, int dst_width) {
|
|
__asm {
|
|
push esi
|
|
mov eax, [esp + 4 + 4] // src_ptr
|
|
mov esi, [esp + 4 + 8] // src_stride
|
|
mov edx, [esp + 4 + 12] // dst_ptr
|
|
mov ecx, [esp + 4 + 16] // dst_width
|
|
movdqa xmm2, xmmword ptr kShufAc
|
|
movdqa xmm3, xmmword ptr kShufAc3
|
|
movdqa xmm4, xmmword ptr kScaleAc33
|
|
pxor xmm5, xmm5 // zero, used to widen bytes to words
|
|
|
|
xloop:
|
|
movdqu xmm0, [eax] // sum up 3 rows into xmm0/1
|
|
movdqu xmm6, [eax + esi]
|
|
movhlps xmm1, xmm0
|
|
movhlps xmm7, xmm6
|
|
punpcklbw xmm0, xmm5
|
|
punpcklbw xmm1, xmm5
|
|
punpcklbw xmm6, xmm5
|
|
punpcklbw xmm7, xmm5
|
|
paddusw xmm0, xmm6
|
|
paddusw xmm1, xmm7
|
|
movdqu xmm6, [eax + esi * 2]
|
|
lea eax, [eax + 16]
|
|
movhlps xmm7, xmm6
|
|
punpcklbw xmm6, xmm5
|
|
punpcklbw xmm7, xmm5
|
|
paddusw xmm0, xmm6
|
|
paddusw xmm1, xmm7
|
|
|
|
movdqa xmm6, xmm0 // 8 pixels -> 0,1,2 of xmm6
|
|
psrldq xmm0, 2
|
|
paddusw xmm6, xmm0
|
|
psrldq xmm0, 2
|
|
paddusw xmm6, xmm0
|
|
pshufb xmm6, xmm2
|
|
|
|
movdqa xmm7, xmm1 // 8 pixels -> 3,4,5 of xmm6
|
|
psrldq xmm1, 2
|
|
paddusw xmm7, xmm1
|
|
psrldq xmm1, 2
|
|
paddusw xmm7, xmm1
|
|
pshufb xmm7, xmm3
|
|
paddusw xmm6, xmm7
|
|
|
|
pmulhuw xmm6, xmm4 // divide by 9,9,6, 9,9,6
|
|
packuswb xmm6, xmm6
|
|
|
|
movd [edx], xmm6 // write 6 pixels
|
|
psrlq xmm6, 16
|
|
movd [edx + 2], xmm6
|
|
lea edx, [edx + 6]
|
|
sub ecx, 6
|
|
jg xloop
|
|
|
|
pop esi
|
|
ret
|
|
}
|
|
}
|
|
|
|
// Scale 16x2 pixels to 6x1 with interpolation
// The two rows (src_ptr and src_ptr + src_stride) are averaged with pavgb.
// kShufAb0/1/2 then pick the first, second and third column of each output's
// box as 16 bit words; their sum is scaled by kScaleAb2 (1/3, 1/3, 1/2).
// The final store uses two overlapping 4 byte writes covering dst bytes 0..5.
|
|
__declspec(naked)
|
|
void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
|
|
ptrdiff_t src_stride,
|
|
uint8* dst_ptr, int dst_width) {
|
|
__asm {
|
|
push esi
|
|
mov eax, [esp + 4 + 4] // src_ptr
|
|
mov esi, [esp + 4 + 8] // src_stride
|
|
mov edx, [esp + 4 + 12] // dst_ptr
|
|
mov ecx, [esp + 4 + 16] // dst_width
|
|
movdqa xmm2, xmmword ptr kShufAb0
|
|
movdqa xmm3, xmmword ptr kShufAb1
|
|
movdqa xmm4, xmmword ptr kShufAb2
|
|
movdqa xmm5, xmmword ptr kScaleAb2
|
|
|
|
xloop:
|
|
movdqu xmm0, [eax] // average 2 rows into xmm0
|
|
movdqu xmm1, [eax + esi]
|
|
lea eax, [eax + 16]
|
|
pavgb xmm0, xmm1
|
|
|
|
movdqa xmm1, xmm0 // 16 pixels -> 0,1,2,3,4,5 of xmm1
|
|
pshufb xmm1, xmm2
|
|
movdqa xmm6, xmm0
|
|
pshufb xmm6, xmm3
|
|
paddusw xmm1, xmm6
|
|
pshufb xmm0, xmm4
|
|
paddusw xmm1, xmm0
|
|
|
|
pmulhuw xmm1, xmm5 // divide by 3,3,2, 3,3,2
|
|
packuswb xmm1, xmm1
|
|
|
|
movd [edx], xmm1 // write 6 pixels
|
|
psrlq xmm1, 16
|
|
movd [edx + 2], xmm1
|
|
lea edx, [edx + 6]
|
|
sub ecx, 6
|
|
jg xloop
|
|
|
|
pop esi
|
|
ret
|
|
}
|
|
}
|
|
|
|
// Reads 16 bytes and accumulates to 16 shorts at a time.
// Each source byte is zero-extended to 16 bits and added, with unsigned
// saturation (paddusw), into the matching word of dst_ptr. The loop always
// runs once and repeats while src_width - 16 * passes > 0.
|
|
__declspec(naked)
|
|
void ScaleAddRow_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
|
|
__asm {
|
|
mov eax, [esp + 4] // src_ptr
|
|
mov edx, [esp + 8] // dst_ptr
|
|
mov ecx, [esp + 12] // src_width
|
|
pxor xmm5, xmm5 // zero, used to widen bytes to words
|
|
|
|
// sum rows
|
|
xloop:
|
|
movdqu xmm3, [eax] // read 16 bytes
|
|
lea eax, [eax + 16]
|
|
movdqu xmm0, [edx] // read 16 words from destination
|
|
movdqu xmm1, [edx + 16]
|
|
movdqa xmm2, xmm3
|
|
punpcklbw xmm2, xmm5
|
|
punpckhbw xmm3, xmm5
|
|
paddusw xmm0, xmm2 // sum 16 words
|
|
paddusw xmm1, xmm3
|
|
movdqu [edx], xmm0 // write 16 words to destination
|
|
movdqu [edx + 16], xmm1
|
|
lea edx, [edx + 32]
|
|
sub ecx, 16
|
|
jg xloop
|
|
ret
|
|
}
|
|
}
|
|
|
|
#ifdef HAS_SCALEADDROW_AVX2
|
|
// Reads 32 bytes and accumulates to 32 shorts at a time.
// Each source byte is zero-extended to 16 bits and added, with unsigned
// saturation (vpaddusw), into the matching word of dst_ptr. vpermq 0xd8
// pre-arranges the quadwords so the per-lane unpacks produce words in
// source order.
|
|
__declspec(naked)
|
|
void ScaleAddRow_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
|
|
__asm {
|
|
mov eax, [esp + 4] // src_ptr
|
|
mov edx, [esp + 8] // dst_ptr
|
|
mov ecx, [esp + 12] // src_width
|
|
vpxor ymm5, ymm5, ymm5 // zero, used to widen bytes to words
|
|
|
|
// sum rows
|
|
xloop:
|
|
vmovdqu ymm3, [eax] // read 32 bytes
|
|
lea eax, [eax + 32]
|
|
vpermq ymm3, ymm3, 0xd8 // unmutate for vpunpck
|
|
vpunpcklbw ymm2, ymm3, ymm5
|
|
vpunpckhbw ymm3, ymm3, ymm5
|
|
vpaddusw ymm0, ymm2, [edx] // sum 16 words
|
|
vpaddusw ymm1, ymm3, [edx + 32]
|
|
vmovdqu [edx], ymm0 // write 32 words to destination
|
|
vmovdqu [edx + 32], ymm1
|
|
lea edx, [edx + 64]
|
|
sub ecx, 32
|
|
jg xloop
|
|
|
|
vzeroupper
|
|
ret
|
|
}
|
|
}
|
|
#endif // HAS_SCALEADDROW_AVX2
|
|
|
|
// Constant for making pixels signed to avoid pmaddubsw
|
|
// saturation.
|
|
static uvec8 kFsub80 =
|
|
{ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 };
|
|
|
|
// Constant for making pixels unsigned and adding .5 for rounding.
|
|
static uvec16 kFadd40 =
|
|
{ 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040 };
|
|
|
|
// Bilinear column filtering. SSSE3 version.
// x and dx are used as 16.16 fixed point: the high word of x is the source
// index and the top 7 bits of the low word are the blend fraction. For each
// output, the 2 source bytes at that index are blended with weights
// (128 - f) and f. Two pixels are produced per pass of the main loop; a
// single trailing pixel is handled after it. Nothing is written when
// dst_width <= 0.
|
|
__declspec(naked)
|
|
void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
|
|
int dst_width, int x, int dx) {
|
|
__asm {
|
|
push ebx
|
|
push esi
|
|
push edi
|
|
// Argument offsets below include the 12 bytes pushed above.
mov edi, [esp + 12 + 4] // dst_ptr
|
|
mov esi, [esp + 12 + 8] // src_ptr
|
|
mov ecx, [esp + 12 + 12] // dst_width
|
|
movd xmm2, [esp + 12 + 16] // x
|
|
movd xmm3, [esp + 12 + 20] // dx
|
|
mov eax, 0x04040000 // shuffle to line up fractions with pixel.
|
|
movd xmm5, eax
|
|
pcmpeqb xmm6, xmm6 // generate 0x007f for inverting fraction.
|
|
psrlw xmm6, 9
|
|
pcmpeqb xmm7, xmm7 // generate 0x0001
|
|
psrlw xmm7, 15
|
|
pextrw eax, xmm2, 1 // get x0 integer. preroll
|
|
sub ecx, 2
|
|
jl xloop29 // fewer than 2 pixels: skip the 2 pixel loop
|
|
|
|
movdqa xmm0, xmm2 // x1 = x0 + dx
|
|
paddd xmm0, xmm3
|
|
punpckldq xmm2, xmm0 // x0 x1
|
|
punpckldq xmm3, xmm3 // dx dx
|
|
paddd xmm3, xmm3 // dx * 2, dx * 2
|
|
pextrw edx, xmm2, 3 // get x1 integer. preroll
|
|
|
|
// 2 Pixel loop.
|
|
xloop2:
|
|
movdqa xmm1, xmm2 // x0, x1 fractions.
|
|
paddd xmm2, xmm3 // x += dx
|
|
movzx ebx, word ptr [esi + eax] // 2 source x0 pixels
|
|
movd xmm0, ebx
|
|
psrlw xmm1, 9 // 7 bit fractions.
|
|
movzx ebx, word ptr [esi + edx] // 2 source x1 pixels
|
|
movd xmm4, ebx
|
|
pshufb xmm1, xmm5 // 0011
|
|
punpcklwd xmm0, xmm4
|
|
psubb xmm0, xmmword ptr kFsub80 // make pixels signed.
|
|
pxor xmm1, xmm6 // 0..7f and 7f..0
|
|
paddusb xmm1, xmm7 // +1 so 0..7f and 80..1
|
|
pmaddubsw xmm1, xmm0 // 16 bit, 2 pixels.
|
|
pextrw eax, xmm2, 1 // get x0 integer. next iteration.
|
|
pextrw edx, xmm2, 3 // get x1 integer. next iteration.
|
|
paddw xmm1, xmmword ptr kFadd40 // make pixels unsigned and round.
|
|
psrlw xmm1, 7 // 8.7 fixed point to low 8 bits.
|
|
packuswb xmm1, xmm1 // 8 bits, 2 pixels.
|
|
movd ebx, xmm1
|
|
mov [edi], bx
|
|
lea edi, [edi + 2]
|
|
sub ecx, 2 // 2 pixels
|
|
jge xloop2
|
|
|
|
xloop29:
|
|
add ecx, 2 - 1 // ecx becomes 0 if exactly one pixel remains
|
|
jl xloop99
|
|
|
|
// 1 pixel remainder
|
|
movzx ebx, word ptr [esi + eax] // 2 source x0 pixels
|
|
movd xmm0, ebx
|
|
psrlw xmm2, 9 // 7 bit fractions.
|
|
pshufb xmm2, xmm5 // 0011
|
|
psubb xmm0, xmmword ptr kFsub80 // make pixels signed.
|
|
pxor xmm2, xmm6 // 0..7f and 7f..0
|
|
paddusb xmm2, xmm7 // +1 so 0..7f and 80..1
|
|
pmaddubsw xmm2, xmm0 // 16 bit
|
|
paddw xmm2, xmmword ptr kFadd40 // make pixels unsigned and round.
|
|
psrlw xmm2, 7 // 8.7 fixed point to low 8 bits.
|
|
packuswb xmm2, xmm2 // 8 bits
|
|
movd ebx, xmm2
|
|
mov [edi], bl
|
|
|
|
xloop99:
|
|
|
|
pop edi
|
|
pop esi
|
|
pop ebx
|
|
ret
|
|
}
|
|
}
|
|
|
|
// Reads 16 pixels, duplicates them and writes 32 pixels.
// The x and dx arguments are not read. dst_width is reduced by 32 per pass
// and the loop repeats while the remainder is > 0, so one full 16 byte read /
// 32 byte write always happens.
|
|
__declspec(naked)
|
|
void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
|
|
int dst_width, int x, int dx) {
|
|
__asm {
|
|
mov edx, [esp + 4] // dst_ptr
|
|
mov eax, [esp + 8] // src_ptr
|
|
mov ecx, [esp + 12] // dst_width
|
|
|
|
wloop:
|
|
movdqu xmm0, [eax]
|
|
lea eax, [eax + 16]
|
|
movdqa xmm1, xmm0
|
|
punpcklbw xmm0, xmm0 // duplicate each of the low 8 bytes
|
|
punpckhbw xmm1, xmm1 // duplicate each of the high 8 bytes
|
|
movdqu [edx], xmm0
|
|
movdqu [edx + 16], xmm1
|
|
lea edx, [edx + 32]
|
|
sub ecx, 32
|
|
jg wloop
|
|
|
|
ret
|
|
}
|
|
}
|
|
|
|
// Reads 8 pixels, throws half away and writes 4 odd pixels (1, 3, 5, 7)
// Pixels are 4 bytes each; shufps 0xdd selects dwords 1 and 3 of each source
// register. dst_width counts pixels and is reduced by 4 per pass.
|
|
__declspec(naked)
|
|
void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
|
|
ptrdiff_t src_stride,
|
|
uint8* dst_argb, int dst_width) {
|
|
__asm {
|
|
mov eax, [esp + 4] // src_argb
|
|
// src_stride ignored
|
|
mov edx, [esp + 12] // dst_argb
|
|
mov ecx, [esp + 16] // dst_width
|
|
|
|
wloop:
|
|
movdqu xmm0, [eax]
|
|
movdqu xmm1, [eax + 16]
|
|
lea eax, [eax + 32]
|
|
shufps xmm0, xmm1, 0xdd // odd pixels
|
|
movdqu [edx], xmm0
|
|
lea edx, [edx + 16]
|
|
sub ecx, 4
|
|
jg wloop
|
|
|
|
ret
|
|
}
|
|
}
|
|
|
|
// Blends 8x1 rectangle to 4x1.
// Each output pixel is the per-byte rounded average (pavgb) of an even source
// pixel and the odd pixel that follows it. Pixels are 4 bytes each;
// dst_width counts pixels and is reduced by 4 per pass.
|
|
__declspec(naked)
|
|
void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
|
|
ptrdiff_t src_stride,
|
|
uint8* dst_argb, int dst_width) {
|
|
__asm {
|
|
mov eax, [esp + 4] // src_argb
|
|
// src_stride ignored
|
|
mov edx, [esp + 12] // dst_argb
|
|
mov ecx, [esp + 16] // dst_width
|
|
|
|
wloop:
|
|
movdqu xmm0, [eax]
|
|
movdqu xmm1, [eax + 16]
|
|
lea eax, [eax + 32]
|
|
movdqa xmm2, xmm0
|
|
shufps xmm0, xmm1, 0x88 // even pixels
|
|
shufps xmm2, xmm1, 0xdd // odd pixels
|
|
pavgb xmm0, xmm2
|
|
movdqu [edx], xmm0
|
|
lea edx, [edx + 16]
|
|
sub ecx, 4
|
|
jg wloop
|
|
|
|
ret
|
|
}
|
|
}
|
|
|
|
// Blends 8x2 rectangle to 4x1.
// The rows at src_argb and src_argb + src_stride are averaged first, then
// each even/odd pixel pair is averaged. Both steps use pavgb, so rounding is
// applied twice rather than once on the 4 pixel sum.
|
|
__declspec(naked)
|
|
void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
|
|
ptrdiff_t src_stride,
|
|
uint8* dst_argb, int dst_width) {
|
|
__asm {
|
|
push esi
|
|
mov eax, [esp + 4 + 4] // src_argb
|
|
mov esi, [esp + 4 + 8] // src_stride
|
|
mov edx, [esp + 4 + 12] // dst_argb
|
|
mov ecx, [esp + 4 + 16] // dst_width
|
|
|
|
wloop:
|
|
movdqu xmm0, [eax]
|
|
movdqu xmm1, [eax + 16]
|
|
movdqu xmm2, [eax + esi]
|
|
movdqu xmm3, [eax + esi + 16]
|
|
lea eax, [eax + 32]
|
|
pavgb xmm0, xmm2 // average rows
|
|
pavgb xmm1, xmm3
|
|
movdqa xmm2, xmm0 // average columns (8 to 4 pixels)
|
|
shufps xmm0, xmm1, 0x88 // even pixels
|
|
shufps xmm2, xmm1, 0xdd // odd pixels
|
|
pavgb xmm0, xmm2
|
|
movdqu [edx], xmm0
|
|
lea edx, [edx + 16]
|
|
sub ecx, 4
|
|
jg wloop
|
|
|
|
pop esi
|
|
ret
|
|
}
|
|
}
|
|
|
|
// Reads 4 pixels at a time.
// Point samples one 4 byte pixel every src_stepx pixels. Each pass gathers
// the pixels at 0, 1, 2 and 3 steps from the current source position, stores
// them as 16 bytes and advances the source by 4 steps.
|
|
__declspec(naked)
|
|
void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
|
|
int src_stepx,
|
|
uint8* dst_argb, int dst_width) {
|
|
__asm {
|
|
push ebx
|
|
push edi
|
|
mov eax, [esp + 8 + 4] // src_argb
|
|
// src_stride ignored
|
|
mov ebx, [esp + 8 + 12] // src_stepx
|
|
mov edx, [esp + 8 + 16] // dst_argb
|
|
mov ecx, [esp + 8 + 20] // dst_width
|
|
lea ebx, [ebx * 4] // step in bytes (4 bytes per pixel)
|
|
lea edi, [ebx + ebx * 2] // 3 steps in bytes
|
|
|
|
wloop:
|
|
movd xmm0, [eax]
|
|
movd xmm1, [eax + ebx]
|
|
punpckldq xmm0, xmm1
|
|
movd xmm2, [eax + ebx * 2]
|
|
movd xmm3, [eax + edi]
|
|
lea eax, [eax + ebx * 4]
|
|
punpckldq xmm2, xmm3
|
|
punpcklqdq xmm0, xmm2
|
|
movdqu [edx], xmm0
|
|
lea edx, [edx + 16]
|
|
sub ecx, 4
|
|
jg wloop
|
|
|
|
pop edi
|
|
pop ebx
|
|
ret
|
|
}
|
|
}
|
|
|
|
// Blends four 2x2 to 4x1.
// At each of 4 positions spaced src_stepx pixels apart, a pair of adjacent
// 4 byte pixels is read from the row at src_argb and from the row at
// src_argb + src_stride. Rows are averaged first, then the two columns, both
// with pavgb.
|
|
__declspec(naked)
|
|
void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
|
|
ptrdiff_t src_stride,
|
|
int src_stepx,
|
|
uint8* dst_argb, int dst_width) {
|
|
__asm {
|
|
push ebx
|
|
push esi
|
|
push edi
|
|
mov eax, [esp + 12 + 4] // src_argb
|
|
mov esi, [esp + 12 + 8] // src_stride
|
|
mov ebx, [esp + 12 + 12] // src_stepx
|
|
mov edx, [esp + 12 + 16] // dst_argb
|
|
mov ecx, [esp + 12 + 20] // dst_width
|
|
lea esi, [eax + esi] // row1 pointer
|
|
lea ebx, [ebx * 4] // step in bytes (4 bytes per pixel)
|
|
lea edi, [ebx + ebx * 2] // 3 steps in bytes
|
|
|
|
wloop:
|
|
movq xmm0, qword ptr [eax] // row0 4 pairs
|
|
movhps xmm0, qword ptr [eax + ebx]
|
|
movq xmm1, qword ptr [eax + ebx * 2]
|
|
movhps xmm1, qword ptr [eax + edi]
|
|
lea eax, [eax + ebx * 4]
|
|
movq xmm2, qword ptr [esi] // row1 4 pairs
|
|
movhps xmm2, qword ptr [esi + ebx]
|
|
movq xmm3, qword ptr [esi + ebx * 2]
|
|
movhps xmm3, qword ptr [esi + edi]
|
|
lea esi, [esi + ebx * 4]
|
|
pavgb xmm0, xmm2 // average rows
|
|
pavgb xmm1, xmm3
|
|
movdqa xmm2, xmm0 // average columns (8 to 4 pixels)
|
|
shufps xmm0, xmm1, 0x88 // even pixels
|
|
shufps xmm2, xmm1, 0xdd // odd pixels
|
|
pavgb xmm0, xmm2
|
|
movdqu [edx], xmm0
|
|
lea edx, [edx + 16]
|
|
sub ecx, 4
|
|
jg wloop
|
|
|
|
pop edi
|
|
pop esi
|
|
pop ebx
|
|
ret
|
|
}
|
|
}
|
|
|
|
// Column scaling unfiltered. SSE2 version.
// x and dx are used as 16.16 fixed point: the high word of each position is
// the source pixel index, and pixels are 4 bytes. xmm2 holds four positions
// x, x + dx, x + 2 * dx, x + 3 * dx and is advanced by 4 * dx per pass of the
// 4 pixel loop. Leftover 2 and 1 pixel tails are handled after the loop.
// Nothing is written when dst_width <= 0.
|
|
__declspec(naked)
|
|
void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
|
|
int dst_width, int x, int dx) {
|
|
__asm {
|
|
push edi
|
|
push esi
|
|
mov edi, [esp + 8 + 4] // dst_argb
|
|
mov esi, [esp + 8 + 8] // src_argb
|
|
mov ecx, [esp + 8 + 12] // dst_width
|
|
movd xmm2, [esp + 8 + 16] // x
|
|
movd xmm3, [esp + 8 + 20] // dx
|
|
|
|
pshufd xmm2, xmm2, 0 // x0 x0 x0 x0
|
|
pshufd xmm0, xmm3, 0x11 // dx 0 dx 0
|
|
paddd xmm2, xmm0
|
|
paddd xmm3, xmm3 // 0, 0, 0, dx * 2
|
|
pshufd xmm0, xmm3, 0x05 // dx * 2, dx * 2, 0, 0
|
|
paddd xmm2, xmm0 // x3 x2 x1 x0
|
|
paddd xmm3, xmm3 // 0, 0, 0, dx * 4
|
|
pshufd xmm3, xmm3, 0 // dx * 4, dx * 4, dx * 4, dx * 4
|
|
|
|
pextrw eax, xmm2, 1 // get x0 integer.
|
|
pextrw edx, xmm2, 3 // get x1 integer.
|
|
|
|
cmp ecx, 0
|
|
jle xloop99 // nothing to do for dst_width <= 0
|
|
sub ecx, 4
|
|
jl xloop49 // fewer than 4 pixels: go straight to the tails
|
|
|
|
// 4 Pixel loop.
|
|
xloop4:
|
|
movd xmm0, [esi + eax * 4] // 1 source x0 pixels
|
|
movd xmm1, [esi + edx * 4] // 1 source x1 pixels
|
|
pextrw eax, xmm2, 5 // get x2 integer.
|
|
pextrw edx, xmm2, 7 // get x3 integer.
|
|
paddd xmm2, xmm3 // x += dx
|
|
punpckldq xmm0, xmm1 // x0 x1
|
|
|
|
movd xmm1, [esi + eax * 4] // 1 source x2 pixels
|
|
movd xmm4, [esi + edx * 4] // 1 source x3 pixels
|
|
pextrw eax, xmm2, 1 // get x0 integer. next iteration.
|
|
pextrw edx, xmm2, 3 // get x1 integer. next iteration.
|
|
punpckldq xmm1, xmm4 // x2 x3
|
|
punpcklqdq xmm0, xmm1 // x0 x1 x2 x3
|
|
movdqu [edi], xmm0
|
|
lea edi, [edi + 16]
|
|
sub ecx, 4 // 4 pixels
|
|
jge xloop4
|
|
|
|
xloop49:
|
|
// ecx is (remaining - 4) here; its low two bits equal those of remaining.
test ecx, 2
|
|
je xloop29
|
|
|
|
// 2 Pixels.
|
|
movd xmm0, [esi + eax * 4] // 1 source x0 pixels
|
|
movd xmm1, [esi + edx * 4] // 1 source x1 pixels
|
|
pextrw eax, xmm2, 5 // get x2 integer.
|
|
punpckldq xmm0, xmm1 // x0 x1
|
|
|
|
movq qword ptr [edi], xmm0
|
|
lea edi, [edi + 8]
|
|
|
|
xloop29:
|
|
test ecx, 1
|
|
je xloop99
|
|
|
|
// 1 Pixels.
|
|
movd xmm0, [esi + eax * 4] // 1 source x2 pixels
|
|
movd dword ptr [edi], xmm0
|
|
xloop99:
|
|
|
|
pop esi
|
|
pop edi
|
|
ret
|
|
}
|
|
}
|
|
|
|
// Bilinear row filtering combines 2x1 -> 1x1. SSSE3 version.
|
|
// TODO(fbarchard): Port to Neon
|
|
|
|
// Shuffle table for arranging 2 pixels into pairs for pmaddubsw.
// Used as the pshufb control in ScaleARGBFilterCols_SSSE3: within each
// 8-byte half (two adjacent 4-byte source pixels), byte n of the first pixel
// is placed next to byte n of the second pixel, so that each 16-bit lane
// holds one channel from both pixels.
|
|
static uvec8 kShuffleColARGB = {
|
|
0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u, // bbggrraa 1st pixel
|
|
8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u // bbggrraa 2nd pixel
|
|
};
|
|
|
|
// Shuffle table for duplicating 2 fractions into 8 bytes each.
// Used as the pshufb control in ScaleARGBFilterCols_SSSE3: source byte 0
// (fraction of the first position) is broadcast to output bytes 0..7 and
// source byte 4 (fraction of the second position) to output bytes 8..15.
|
|
static uvec8 kShuffleFractions = {
|
|
0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
|
|
};
|
|
|
|
// Column scaling with horizontal filtering of 4-byte pixels. SSSE3 version.
// For each of the dst_width output pixels, the source position starts at x
// and advances by dx. The upper 16 bits of the 32-bit position select a
// source pixel; that pixel and the one following it are blended per byte
// using the top 7 bits of the lower 16 bits as the fraction f:
//   out = (left * (127 - f) + right * f) >> 7
// Every output pixel therefore reads 8 source bytes (two adjacent pixels).
// Pixels are produced 2 at a time, followed by an optional single pixel.
// Does nothing when dst_width <= 0.
// Arguments are read from the stack; esi and edi are saved and restored.
__declspec(naked)
|
|
void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
|
|
int dst_width, int x, int dx) {
|
|
__asm {
|
|
push esi
|
|
push edi
|
|
// Two pushes above, so the first argument is at [esp + 8 + 4].
mov edi, [esp + 8 + 4] // dst_argb
|
|
mov esi, [esp + 8 + 8] // src_argb
|
|
mov ecx, [esp + 8 + 12] // dst_width
|
|
movd xmm2, [esp + 8 + 16] // x
|
|
movd xmm3, [esp + 8 + 20] // dx
|
|
// Aligned loads of the two pshufb control tables.
movdqa xmm4, xmmword ptr kShuffleColARGB
|
|
movdqa xmm5, xmmword ptr kShuffleFractions
|
|
pcmpeqb xmm6, xmm6 // generate 0x007f for inverting fraction.
|
|
psrlw xmm6, 9
|
|
pextrw eax, xmm2, 1 // get x0 integer. preroll
|
|
// Bias the counter by -2 so the sign tells whether a full pair remains.
sub ecx, 2
|
|
jl xloop29
|
|
|
|
// Set up two positions (x0, x1) in xmm2 and a step of 2 * dx in xmm3.
movdqa xmm0, xmm2 // x1 = x0 + dx
|
|
paddd xmm0, xmm3
|
|
punpckldq xmm2, xmm0 // x0 x1
|
|
punpckldq xmm3, xmm3 // dx dx
|
|
paddd xmm3, xmm3 // dx * 2, dx * 2
|
|
pextrw edx, xmm2, 3 // get x1 integer. preroll
|
|
|
|
// 2 Pixel loop.
|
|
xloop2:
|
|
movdqa xmm1, xmm2 // x0, x1 fractions.
|
|
paddd xmm2, xmm3 // x += dx * 2 (advance both positions)
|
|
movq xmm0, qword ptr [esi + eax * 4] // 2 source x0 pixels
|
|
psrlw xmm1, 9 // 7 bit fractions.
|
|
movhps xmm0, qword ptr [esi + edx * 4] // 2 source x1 pixels
|
|
pshufb xmm1, xmm5 // 0000000011111111
|
|
pshufb xmm0, xmm4 // arrange pixels into pairs
|
|
// xmm6 is 0x007f per word: only the even (left pixel) bytes are inverted,
// giving weights 127 - f for the left pixel and f for the right pixel.
pxor xmm1, xmm6 // 0..7f and 7f..0
|
|
pmaddubsw xmm0, xmm1 // argb_argb 16 bit, 2 pixels.
|
|
pextrw eax, xmm2, 1 // get x0 integer. next iteration.
|
|
pextrw edx, xmm2, 3 // get x1 integer. next iteration.
|
|
psrlw xmm0, 7 // argb 8.7 fixed point to low 8 bits.
|
|
packuswb xmm0, xmm0 // argb_argb 8 bits, 2 pixels.
|
|
movq qword ptr [edi], xmm0 // store 2 pixels (8 bytes)
|
|
lea edi, [edi + 8]
|
|
sub ecx, 2 // 2 pixels
|
|
jge xloop2
|
|
|
|
xloop29:
|
|
|
|
// Undo the -2 bias and subtract 1: negative means no pixel is left.
add ecx, 2 - 1
|
|
jl xloop99
|
|
|
|
// 1 pixel remainder
|
|
// Lane 0 of xmm2 holds the position of the remaining pixel and eax its
// integer part, whether or not the 2 pixel loop ran.
psrlw xmm2, 9 // 7 bit fractions.
|
|
movq xmm0, qword ptr [esi + eax * 4] // 2 source x0 pixels
|
|
pshufb xmm2, xmm5 // 00000000
|
|
pshufb xmm0, xmm4 // arrange pixels into pairs
|
|
pxor xmm2, xmm6 // 0..7f and 7f..0
|
|
pmaddubsw xmm0, xmm2 // argb 16 bit, 1 pixel.
|
|
psrlw xmm0, 7
|
|
packuswb xmm0, xmm0 // argb 8 bits, 1 pixel.
|
|
movd [edi], xmm0 // store 1 pixel (4 bytes)
|
|
|
|
xloop99:
|
|
|
|
pop edi
|
|
pop esi
|
|
ret
|
|
}
|
|
}
|
|
|
|
// Reads 4 pixels, duplicates them and writes 8 pixels.
// Each 4-byte source pixel is written twice in a row, doubling the width.
// The x and dx arguments are never read by this routine.
// The loop body runs before dst_width is tested, so at least one group is
// always processed (even for dst_width <= 0), and output is written in whole
// groups of 8 pixels (32 bytes) per 16 source bytes.
// NOTE(review): callers presumably guarantee dst_width is a positive
// multiple of 8 or that the buffers tolerate the overrun - confirm.
|
|
__declspec(naked)
|
|
void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
|
|
int dst_width, int x, int dx) {
|
|
__asm {
|
|
// No registers are pushed, so the first argument is at [esp + 4].
mov edx, [esp + 4] // dst_argb
|
|
mov eax, [esp + 8] // src_argb
|
|
mov ecx, [esp + 12] // dst_width
|
|
|
|
wloop:
|
|
movdqu xmm0, [eax] // load 4 source pixels (unaligned)
|
|
lea eax, [eax + 16]
|
|
movdqa xmm1, xmm0
|
|
punpckldq xmm0, xmm0 // pixels 0 0 1 1
|
|
punpckhdq xmm1, xmm1 // pixels 2 2 3 3
|
|
movdqu [edx], xmm0
|
|
movdqu [edx + 16], xmm1
|
|
lea edx, [edx + 32]
|
|
sub ecx, 8 // 8 destination pixels
|
|
jg wloop
|
|
|
|
ret
|
|
}
|
|
}
|
|
|
|
// Divide num by div and return as 16.16 fixed point result.
// Computes (num << 16) / div using a 64-bit signed dividend in edx:eax and
// returns the 32-bit quotient in eax.
// idiv raises a divide error if div is 0 or the quotient does not fit in
// 32 bits; no check is made here.
|
|
__declspec(naked)
|
|
int FixedDiv_X86(int num, int div) {
|
|
__asm {
|
|
mov eax, [esp + 4] // num
|
|
cdq // extend num to 64 bits
|
|
// Shift the 64-bit value edx:eax left by 16.
shld edx, eax, 16 // 32.16
|
|
shl eax, 16
|
|
idiv dword ptr [esp + 8] // divide by div; quotient in eax
|
|
ret
|
|
}
|
|
}
|
|
|
|
// Variant of FixedDiv_X86 that biases both operands before dividing.
// Computes ((num << 16) - 0x00010001) / (div - 1) using a 64-bit signed
// dividend in edx:eax and returns the 32-bit quotient in eax.
// Equivalently the dividend is ((num - 1) << 16) - 1.
// idiv raises a divide error if div is 1 (divisor becomes 0) or the quotient
// does not fit in 32 bits; no check is made here.
|
|
__declspec(naked)
|
|
int FixedDiv1_X86(int num, int div) {
|
|
__asm {
|
|
mov eax, [esp + 4] // num
|
|
mov ecx, [esp + 8] // denom
|
|
cdq // extend num to 64 bits
|
|
// Shift the 64-bit value edx:eax left by 16.
shld edx, eax, 16 // 32.16
|
|
shl eax, 16
|
|
// 64-bit subtract of 0x00010001 from edx:eax (borrow propagates via sbb).
sub eax, 0x00010001
|
|
sbb edx, 0
|
|
sub ecx, 1 // denom - 1
|
|
idiv ecx // quotient in eax
|
|
ret
|
|
}
|
|
}
|
|
#endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
|
|
|
|
#ifdef __cplusplus
|
|
} // extern "C"
|
|
} // namespace libyuv
|
|
#endif
|