412 lines
13 KiB
C
412 lines
13 KiB
C
/***********************************************************************/
|
|
/* */
|
|
/* Objective Caml */
|
|
/* */
|
|
/* Xavier Leroy, projet Cristal, INRIA Rocquencourt */
|
|
/* */
|
|
/* Copyright 2003 Institut National de Recherche en Informatique et */
|
|
/* en Automatique. All rights reserved. This file is distributed */
|
|
/* under the terms of the GNU Library General Public License, with */
|
|
/* the special exception on linking described in file ../../LICENSE. */
|
|
/* */
|
|
/***********************************************************************/
|
|
|
|
/* $Id$ */
|
|
|
|
/* Code specific to the Intel IA32 (x86) architecture. */
|
|
|
|
#define BngAdd2(res,carryout,arg1,arg2) \
|
|
asm("xorl %1, %1 \n\t" \
|
|
"addl %3, %0 \n\t" \
|
|
"setc %b1" \
|
|
: "=r" (res), "=&q" (carryout) \
|
|
: "0" (arg1), "rm" (arg2))
|
|
|
|
#define BngSub2(res,carryout,arg1,arg2) \
|
|
asm("xorl %1, %1 \n\t" \
|
|
"subl %3, %0 \n\t" \
|
|
"setc %b1" \
|
|
: "=r" (res), "=&q" (carryout) \
|
|
: "0" (arg1), "rm" (arg2))
|
|
|
|
#define BngMult(resh,resl,arg1,arg2) \
|
|
asm("mull %3" \
|
|
: "=a" (resl), "=d" (resh) \
|
|
: "a" (arg1), "r" (arg2))
|
|
|
|
#define BngDiv(quo,rem,nh,nl,d) \
|
|
asm("divl %4" \
|
|
: "=a" (quo), "=d" (rem) \
|
|
: "a" (nl), "d" (nh), "r" (d))
|
|
|
|
/* Reimplementation in asm of some of the bng operations. */
|
|
|
|
static bngcarry bng_ia32_add
|
|
(bng a/*[alen]*/, bngsize alen,
|
|
bng b/*[blen]*/, bngsize blen,
|
|
bngcarry carry)
|
|
{
|
|
bngdigit tmp;
|
|
alen -= blen;
|
|
if (blen > 0) {
|
|
asm("negb %b3 \n\t"
|
|
"1: \n\t"
|
|
"movl (%0), %4 \n\t"
|
|
"adcl (%1), %4 \n\t"
|
|
"movl %4, (%0) \n\t"
|
|
"leal 4(%0), %0 \n\t"
|
|
"leal 4(%1), %1 \n\t"
|
|
"decl %2 \n\t"
|
|
"jnz 1b \n\t"
|
|
"setc %b3"
|
|
: "+&r" (a), "+&r" (b), "+&r" (blen), "+&q" (carry), "=&r" (tmp));
|
|
}
|
|
if (carry == 0 || alen == 0) return carry;
|
|
do {
|
|
if (++(*a) != 0) return 0;
|
|
a++;
|
|
} while (--alen);
|
|
return 1;
|
|
}
|
|
|
|
static bngcarry bng_ia32_sub
|
|
(bng a/*[alen]*/, bngsize alen,
|
|
bng b/*[blen]*/, bngsize blen,
|
|
bngcarry carry)
|
|
{
|
|
bngdigit tmp;
|
|
alen -= blen;
|
|
if (blen > 0) {
|
|
asm("negb %b3 \n\t"
|
|
"1: \n\t"
|
|
"movl (%0), %4 \n\t"
|
|
"sbbl (%1), %4 \n\t"
|
|
"movl %4, (%0) \n\t"
|
|
"leal 4(%0), %0 \n\t"
|
|
"leal 4(%1), %1 \n\t"
|
|
"decl %2 \n\t"
|
|
"jnz 1b \n\t"
|
|
"setc %b3"
|
|
: "+&r" (a), "+&r" (b), "+&r" (blen), "+&q" (carry), "=&r" (tmp));
|
|
}
|
|
if (carry == 0 || alen == 0) return carry;
|
|
do {
|
|
if ((*a)-- != 0) return 0;
|
|
a++;
|
|
} while (--alen);
|
|
return 1;
|
|
}
|
|
|
|
static bngdigit bng_ia32_mult_add_digit
|
|
(bng a/*[alen]*/, bngsize alen,
|
|
bng b/*[blen]*/, bngsize blen,
|
|
bngdigit d)
|
|
{
|
|
bngdigit out;
|
|
bngcarry carry;
|
|
|
|
alen -= blen;
|
|
out = 0;
|
|
if (blen > 0) {
|
|
asm("1: \n\t"
|
|
"movl (%1), %%eax \n\t"
|
|
"mull %4\n\t" /* edx:eax = d * next digit of b */
|
|
"addl (%0), %%eax \n\t" /* add next digit of a to eax */
|
|
"adcl $0, %%edx \n\t" /* accumulate carry in edx */
|
|
"addl %3, %%eax \n\t" /* add out to eax */
|
|
"adcl $0, %%edx \n\t" /* accumulate carry in edx */
|
|
"movl %%eax, (%0) \n\t" /* eax is next digit of result */
|
|
"movl %%edx, %3 \n\t" /* edx is next out */
|
|
"leal 4(%0), %0 \n\t"
|
|
"leal 4(%1), %1 \n\t"
|
|
"decl %2 \n\t"
|
|
"jnz 1b"
|
|
: "+&r" (a), "+&r" (b), "+&r" (blen), "=m" (out)
|
|
: "m" (d)
|
|
: "eax", "edx");
|
|
}
|
|
if (alen == 0) return out;
|
|
/* current digit of a += out */
|
|
BngAdd2(*a, carry, *a, out);
|
|
a++;
|
|
alen--;
|
|
/* Propagate carry */
|
|
if (carry == 0 || alen == 0) return carry;
|
|
do {
|
|
if (++(*a) != 0) return 0;
|
|
a++;
|
|
} while (--alen);
|
|
return 1;
|
|
}
|
|
|
|
static bngdigit bng_ia32_mult_sub_digit
|
|
(bng a/*[alen]*/, bngsize alen,
|
|
bng b/*[blen]*/, bngsize blen,
|
|
bngdigit d)
|
|
{
|
|
bngdigit out, tmp;
|
|
bngcarry carry;
|
|
|
|
alen -= blen;
|
|
out = 0;
|
|
if (blen > 0) {
|
|
asm("1: \n\t"
|
|
"movl (%1), %%eax \n\t"
|
|
"movl (%0), %4 \n\t"
|
|
"mull %5\n\t" /* edx:eax = d * next digit of b */
|
|
"subl %%eax, %4 \n\t" /* subtract eax from next digit of a */
|
|
"adcl $0, %%edx \n\t" /* accumulate carry in edx */
|
|
"subl %3, %4 \n\t" /* subtract out */
|
|
"adcl $0, %%edx \n\t" /* accumulate carry in edx */
|
|
"movl %4, (%0) \n\t" /* store next digit of result */
|
|
"movl %%edx, %3 \n\t" /* edx is next out */
|
|
"leal 4(%0), %0 \n\t"
|
|
"leal 4(%1), %1 \n\t"
|
|
"decl %2 \n\t"
|
|
"jnz 1b"
|
|
: "+&r" (a), "+&r" (b), "=m" (blen), "=m" (out), "=&r" (tmp)
|
|
: "m" (d)
|
|
: "eax", "edx");
|
|
}
|
|
if (alen == 0) return out;
|
|
/* current digit of a -= out */
|
|
BngSub2(*a, carry, *a, out);
|
|
a++;
|
|
alen--;
|
|
/* Propagate carry */
|
|
if (carry == 0 || alen == 0) return carry;
|
|
do {
|
|
if ((*a)-- != 0) return 0;
|
|
a++;
|
|
} while (--alen);
|
|
return 1;
|
|
}
|
|
|
|
/* This is another asm implementation of some of the bng operations,
|
|
using SSE2 operations to provide 64-bit arithmetic.
|
|
This is faster than the plain IA32 code above on the Pentium 4.
|
|
(Arithmetic operations with carry are slow on the Pentium 4). */
|
|
|
|
#if BNG_ASM_LEVEL >= 2
|
|
|
|
static bngcarry bng_ia32sse2_add
|
|
(bng a/*[alen]*/, bngsize alen,
|
|
bng b/*[blen]*/, bngsize blen,
|
|
bngcarry carry)
|
|
{
|
|
alen -= blen;
|
|
if (blen > 0) {
|
|
asm("movd %3, %%mm0 \n\t" /* MM0 is carry */
|
|
"1: \n\t"
|
|
"movd (%0), %%mm1 \n\t" /* MM1 is next digit of a */
|
|
"movd (%1), %%mm2 \n\t" /* MM2 is next digit of b */
|
|
"paddq %%mm1, %%mm0 \n\t" /* Add carry (64 bits) */
|
|
"paddq %%mm2, %%mm0 \n\t" /* Add digits (64 bits) */
|
|
"movd %%mm0, (%0) \n\t" /* Store low 32 bits of result */
|
|
"psrlq $32, %%mm0 \n\t" /* Next carry is top 32 bits of results */
|
|
"addl $4, %0\n\t"
|
|
"addl $4, %1\n\t"
|
|
"subl $1, %2\n\t"
|
|
"jne 1b \n\t"
|
|
"movd %%mm0, %3 \n\t"
|
|
"emms"
|
|
: "+&r" (a), "+&r" (b), "+&r" (blen), "+&rm" (carry));
|
|
}
|
|
if (carry == 0 || alen == 0) return carry;
|
|
do {
|
|
if (++(*a) != 0) return 0;
|
|
a++;
|
|
} while (--alen);
|
|
return 1;
|
|
}
|
|
|
|
static bngcarry bng_ia32sse2_sub
|
|
(bng a/*[alen]*/, bngsize alen,
|
|
bng b/*[blen]*/, bngsize blen,
|
|
bngcarry carry)
|
|
{
|
|
alen -= blen;
|
|
if (blen > 0) {
|
|
asm("movd %3, %%mm0 \n\t" /* MM0 is carry */
|
|
"1: \n\t"
|
|
"movd (%0), %%mm1 \n\t" /* MM1 is next digit of a */
|
|
"movd (%1), %%mm2 \n\t" /* MM2 is next digit of b */
|
|
"psubq %%mm0, %%mm1 \n\t" /* Subtract carry (64 bits) */
|
|
"psubq %%mm2, %%mm1 \n\t" /* Subtract digits (64 bits) */
|
|
"movd %%mm1, (%0) \n\t" /* Store low 32 bits of result */
|
|
"psrlq $63, %%mm1 \n\t" /* Next carry is sign bit of result */
|
|
"movq %%mm1, %%mm0 \n\t"
|
|
"addl $4, %0\n\t"
|
|
"addl $4, %1\n\t"
|
|
"subl $1, %2\n\t"
|
|
"jne 1b \n\t"
|
|
"movd %%mm0, %3 \n\t"
|
|
"emms"
|
|
: "+&r" (a), "+&r" (b), "+&r" (blen), "+&rm" (carry));
|
|
}
|
|
if (carry == 0 || alen == 0) return carry;
|
|
do {
|
|
if ((*a)-- != 0) return 0;
|
|
a++;
|
|
} while (--alen);
|
|
return 1;
|
|
}
|
|
|
|
static bngdigit bng_ia32sse2_mult_add_digit
|
|
(bng a/*[alen]*/, bngsize alen,
|
|
bng b/*[blen]*/, bngsize blen,
|
|
bngdigit d)
|
|
{
|
|
bngdigit out;
|
|
bngcarry carry;
|
|
|
|
alen -= blen;
|
|
out = 0;
|
|
if (blen > 0) {
|
|
asm("pxor %%mm0, %%mm0 \n\t" /* MM0 is carry */
|
|
"movd %4, %%mm7 \n\t" /* MM7 is digit d */
|
|
"1: \n\t"
|
|
"movd (%0), %%mm1 \n\t" /* MM1 is next digit of a */
|
|
"movd (%1), %%mm2 \n\t" /* MM2 is next digit of b */
|
|
"pmuludq %%mm7, %%mm2 \n\t" /* MM2 = d * digit of b */
|
|
"paddq %%mm1, %%mm0 \n\t" /* Add product and carry ... */
|
|
"paddq %%mm2, %%mm0 \n\t" /* ... and digit of a */
|
|
"movd %%mm0, (%0) \n\t" /* Store low 32 bits of result */
|
|
"psrlq $32, %%mm0 \n\t" /* Next carry is high 32 bits result */
|
|
"addl $4, %0\n\t"
|
|
"addl $4, %1\n\t"
|
|
"subl $1, %2\n\t"
|
|
"jne 1b \n\t"
|
|
"movd %%mm0, %3 \n\t"
|
|
"emms"
|
|
: "+&r" (a), "+&r" (b), "+&r" (blen), "=&rm" (out)
|
|
: "m" (d));
|
|
}
|
|
if (alen == 0) return out;
|
|
/* current digit of a += out */
|
|
BngAdd2(*a, carry, *a, out);
|
|
a++;
|
|
alen--;
|
|
/* Propagate carry */
|
|
if (carry == 0 || alen == 0) return carry;
|
|
do {
|
|
if (++(*a) != 0) return 0;
|
|
a++;
|
|
} while (--alen);
|
|
return 1;
|
|
}
|
|
|
|
static bngdigit bng_ia32sse2_mult_sub_digit
|
|
(bng a/*[alen]*/, bngsize alen,
|
|
bng b/*[blen]*/, bngsize blen,
|
|
bngdigit d)
|
|
{
|
|
static unsigned long long bias1 = 0xFFFFFFFF00000000ULL - 0xFFFFFFFFULL;
|
|
static unsigned long bias2 = 0xFFFFFFFFUL;
|
|
bngdigit out;
|
|
bngcarry carry;
|
|
|
|
alen -= blen;
|
|
out = 0;
|
|
if (blen > 0) {
|
|
/* Carry C is represented by ENC(C) = 0xFFFFFFFF - C (one's complement) */
|
|
asm("movd %6, %%mm0 \n\t" /* MM0 is carry (initially 0xFFFFFFFF) */
|
|
"movq %5, %%mm6 \n\t" /* MM6 is magic constant bias1 */
|
|
"movd %4, %%mm7 \n\t" /* MM7 is digit d */
|
|
"1: \n\t"
|
|
"movd (%0), %%mm1 \n\t" /* MM1 is next digit of a */
|
|
"movd (%1), %%mm2 \n\t" /* MM2 is next digit of b */
|
|
"paddq %%mm6, %%mm1 \n\t" /* bias digit of a */
|
|
"pmuludq %%mm7, %%mm2 \n\t" /* MM2 = d * digit of b */
|
|
/* Compute
|
|
digit of a + ENC(carry) + 0xFFFFFFFF00000000 - 0xFFFFFFFF - product
|
|
= digit of a - carry + 0xFFFFFFFF00000000 - product
|
|
= digit of a - carry - productlow + (ENC(nextcarry) << 32) */
|
|
"psubq %%mm2, %%mm1 \n\t"
|
|
"paddq %%mm1, %%mm0 \n\t"
|
|
"movd %%mm0, (%0) \n\t" /* Store low 32 bits of result */
|
|
"psrlq $32, %%mm0 \n\t" /* Next carry is 32 high bits of result */
|
|
"addl $4, %0\n\t"
|
|
"addl $4, %1\n\t"
|
|
"subl $1, %2\n\t"
|
|
"jne 1b \n\t"
|
|
"movd %%mm0, %3 \n\t"
|
|
"emms"
|
|
: "+&r" (a), "+&r" (b), "+&r" (blen), "=&rm" (out)
|
|
: "m" (d), "m" (bias1), "m" (bias2));
|
|
out = ~out; /* Undo encoding on out digit */
|
|
}
|
|
if (alen == 0) return out;
|
|
/* current digit of a -= out */
|
|
BngSub2(*a, carry, *a, out);
|
|
a++;
|
|
alen--;
|
|
/* Propagate carry */
|
|
if (carry == 0 || alen == 0) return carry;
|
|
do {
|
|
if ((*a)-- != 0) return 0;
|
|
a++;
|
|
} while (--alen);
|
|
return 1;
|
|
}
|
|
|
|
/* Detect whether SSE2 instructions are supported */
|
|
|
|
static int bng_ia32_sse2_supported(void)
|
|
{
|
|
unsigned int flags, newflags, max_id, capabilities;
|
|
|
|
#define EFLAG_CPUID 0x00200000
|
|
#define CPUID_IDENTIFY 0
|
|
#define CPUID_CAPABILITIES 1
|
|
#define SSE2_CAPABILITY 26
|
|
|
|
/* Check if processor has CPUID instruction */
|
|
asm("pushfl \n\t"
|
|
"popl %0"
|
|
: "=r" (flags) : );
|
|
newflags = flags ^ EFLAG_CPUID; /* CPUID detection flag */
|
|
asm("pushfl \n\t"
|
|
"pushl %1 \n\t"
|
|
"popfl \n\t"
|
|
"pushfl \n\t"
|
|
"popl %0 \n\t"
|
|
"popfl"
|
|
: "=r" (flags) : "r" (newflags));
|
|
/* If CPUID detection flag cannot be changed, CPUID instruction is not
|
|
available */
|
|
if ((flags & EFLAG_CPUID) != (newflags & EFLAG_CPUID)) return 0;
|
|
/* See if SSE2 extensions are supported */
|
|
asm("pushl %%ebx \n\t" /* need to preserve %ebx for PIC */
|
|
"cpuid \n\t"
|
|
"popl %%ebx"
|
|
: "=a" (max_id) : "a" (CPUID_IDENTIFY): "ecx", "edx");
|
|
if (max_id < 1) return 0;
|
|
asm("pushl %%ebx \n\t"
|
|
"cpuid \n\t"
|
|
"popl %%ebx"
|
|
: "=d" (capabilities) : "a" (CPUID_CAPABILITIES) : "ecx");
|
|
return capabilities & (1 << SSE2_CAPABILITY);
|
|
}
|
|
|
|
#endif
|
|
|
|
static void bng_ia32_setup_ops(void)
|
|
{
|
|
#if BNG_ASM_LEVEL >= 2
|
|
if (bng_ia32_sse2_supported()) {
|
|
bng_ops.add = bng_ia32sse2_add;
|
|
bng_ops.sub = bng_ia32sse2_sub;
|
|
bng_ops.mult_add_digit = bng_ia32sse2_mult_add_digit;
|
|
bng_ops.mult_sub_digit = bng_ia32sse2_mult_sub_digit;
|
|
return;
|
|
}
|
|
#endif
|
|
bng_ops.add = bng_ia32_add;
|
|
bng_ops.sub = bng_ia32_sub;
|
|
bng_ops.mult_add_digit = bng_ia32_mult_add_digit;
|
|
bng_ops.mult_sub_digit = bng_ia32_mult_sub_digit;
|
|
}
|
|
|
|
#define BNG_SETUP_OPS bng_ia32_setup_ops()
|