ocaml/otherlibs/num/bng_ia32.c

/***********************************************************************/
/*                                                                     */
/*                           Objective Caml                            */
/*                                                                     */
/*            Xavier Leroy, projet Cristal, INRIA Rocquencourt         */
/*                                                                     */
/*  Copyright 2003 Institut National de Recherche en Informatique et   */
/*  en Automatique.  All rights reserved.  This file is distributed    */
/*  under the terms of the GNU Library General Public License, with    */
/*  the special exception on linking described in file ../../LICENSE.  */
/*                                                                     */
/***********************************************************************/

/* $Id$ */

/* Code specific to the Intel IA32 (x86) architecture. */

/* BngAdd2(res, carryout, arg1, arg2):
     res      = arg1 + arg2   (32-bit addl; arg1 is tied to the register
                               that receives res via the "0" constraint)
     carryout = carry flag produced by the addition (0 or 1).
   carryout is zeroed with xorl *before* the add, because xorl itself
   modifies the flags; setc then writes only its low byte.  The "q"
   constraint requests a register whose low byte is addressable, and "&"
   (early clobber) keeps that register distinct from the input operands. */
#define BngAdd2(res,carryout,arg1,arg2)                                     \
  asm("xorl %1, %1 \n\t"                                                    \
      "addl %3, %0 \n\t"                                                    \
      "setc %b1"                                                            \
      : "=r" (res), "=&q" (carryout)                                        \
      : "0" (arg1), "rm" (arg2))

/* BngSub2(res, carryout, arg1, arg2):
     res      = arg1 - arg2   (32-bit subl; arg1 is tied to the register
                               that receives res via the "0" constraint)
     carryout = borrow produced by the subtraction (carry flag, 0 or 1).
   carryout is zeroed with xorl *before* the subtract, because xorl itself
   modifies the flags; setc then writes only its low byte.  The "q"
   constraint requests a register whose low byte is addressable, and "&"
   (early clobber) keeps that register distinct from the input operands. */
#define BngSub2(res,carryout,arg1,arg2)                                     \
  asm("xorl %1, %1 \n\t"                                                    \
      "subl %3, %0 \n\t"                                                    \
      "setc %b1"                                                            \
      : "=r" (res), "=&q" (carryout)                                        \
      : "0" (arg1), "rm" (arg2))

/* BngMult(resh, resl, arg1, arg2):
   unsigned 32x32 -> 64 bit multiplication using mull.
   arg1 is loaded into eax and multiplied by arg2 (any register); the low
   32 bits of the product come back in eax (resl) and the high 32 bits in
   edx (resh). */
#define BngMult(resh,resl,arg1,arg2)                                        \
  asm("mull %3"                                                             \
      : "=a" (resl), "=d" (resh)                                            \
      : "a" (arg1), "r" (arg2))

/* BngDiv(quo, rem, nh, nl, d):
   unsigned 64 / 32 bit division using divl.
   The dividend is the double digit nh:nl (nh in edx, nl in eax) and the
   divisor is d (any register); the quotient comes back in eax (quo) and
   the remainder in edx (rem).
   NOTE(review): divl raises a divide-error exception when d is zero or
   when the quotient does not fit in 32 bits, so callers presumably
   guarantee nh < d -- confirm at the call sites. */
#define BngDiv(quo,rem,nh,nl,d)                                             \
  asm("divl %4"                                                             \
      : "=a" (quo), "=d" (rem)                                              \
      : "a" (nl), "d" (nh), "r" (d))

/* Reimplementation in asm of some of the bng operations. */

/* In-place addition: a[0..alen-1] += b[0..blen-1] + carry.
   The asm loop adds the blen low digits with add-with-carry; the C tail
   then propagates the remaining carry through the upper alen - blen
   digits of a.  Returns the carry out (0 or 1).
   carry is expected to be 0 or 1 on entry.
   Assumes alen >= blen (alen - blen is used as a digit count) -- TODO
   confirm against callers. */
static bngcarry bng_ia32_add
       (bng a/*[alen]*/, bngsize alen,
        bng b/*[blen]*/, bngsize blen,
        bngcarry carry)
{
  bngdigit tmp;
  alen -= blen;
  if (blen > 0) {
    /* negb moves the incoming carry into CF (CF is set iff the byte was
       non-zero).  Inside the loop the pointers and the counter are stepped
       with leal/decl because neither instruction modifies CF, so the carry
       chain of adcl survives from one digit to the next.  setc finally
       copies CF back into the low byte of carry.
       The "memory" clobber tells the compiler that the asm reads *b and
       writes *a, which are not expressed through the operands. */
    asm("negb %b3 \n\t"
        "1: \n\t"
        "movl (%0), %4 \n\t"
        "adcl (%1), %4 \n\t"
        "movl %4, (%0) \n\t"
        "leal 4(%0), %0 \n\t"
        "leal 4(%1), %1 \n\t"
        "decl %2 \n\t"
        "jnz 1b \n\t"
        "setc %b3"
        : "+&r" (a), "+&r" (b), "+&r" (blen), "+&q" (carry), "=&r" (tmp)
        : /* no inputs */
        : "memory", "cc");
  }
  /* Nothing left to propagate, or no digits left to propagate into. */
  if (carry == 0 || alen == 0) return carry;
  do {
    /* Incrementing a digit that does not wrap to 0 absorbs the carry. */
    if (++(*a) != 0) return 0;
    a++;
  } while (--alen);
  return 1;
}

/* In-place subtraction: a[0..alen-1] -= b[0..blen-1] + carry.
   The asm loop subtracts the blen low digits with subtract-with-borrow;
   the C tail then propagates the remaining borrow through the upper
   alen - blen digits of a.  Returns the borrow out (0 or 1).
   carry is expected to be 0 or 1 on entry.
   Assumes alen >= blen (alen - blen is used as a digit count) -- TODO
   confirm against callers. */
static bngcarry bng_ia32_sub
       (bng a/*[alen]*/, bngsize alen,
        bng b/*[blen]*/, bngsize blen,
        bngcarry carry)
{
  bngdigit tmp;
  alen -= blen;
  if (blen > 0) {
    /* negb moves the incoming borrow into CF (CF is set iff the byte was
       non-zero).  Inside the loop the pointers and the counter are stepped
       with leal/decl because neither instruction modifies CF, so the borrow
       chain of sbbl survives from one digit to the next.  setc finally
       copies CF back into the low byte of carry.
       The "memory" clobber tells the compiler that the asm reads *b and
       writes *a, which are not expressed through the operands. */
    asm("negb %b3 \n\t"
        "1: \n\t"
        "movl (%0), %4 \n\t"
        "sbbl (%1), %4 \n\t"
        "movl %4, (%0) \n\t"
        "leal 4(%0), %0 \n\t"
        "leal 4(%1), %1 \n\t"
        "decl %2 \n\t"
        "jnz 1b \n\t"
        "setc %b3"
        : "+&r" (a), "+&r" (b), "+&r" (blen), "+&q" (carry), "=&r" (tmp)
        : /* no inputs */
        : "memory", "cc");
  }
  /* Nothing left to propagate, or no digits left to propagate into. */
  if (carry == 0 || alen == 0) return carry;
  do {
    /* Decrementing a digit that was non-zero absorbs the borrow. */
    if ((*a)-- != 0) return 0;
    a++;
  } while (--alen);
  return 1;
}

/* In-place multiply-accumulate: a[0..alen-1] += d * b[0..blen-1].
   The asm loop handles the blen low digits, carrying a full digit ("out")
   from one position to the next.  If a has no further digits, out is
   returned as the carry-out digit; otherwise out is added into the next
   digit of a and the resulting one-bit carry is propagated upwards, and
   that carry (0 or 1) is returned.
   Assumes alen >= blen (alen - blen is used as a digit count) -- TODO
   confirm against callers. */
static bngdigit bng_ia32_mult_add_digit
     (bng a/*[alen]*/, bngsize alen,
      bng b/*[blen]*/, bngsize blen,
      bngdigit d)
{
  bngdigit out;
  bngcarry carry;

  alen -= blen;
  out = 0;
  if (blen > 0) {
    /* out is both read ("addl %3, %%eax" on the very first iteration) and
       written by the asm, so it must be a read-write operand ("+m");
       declaring it write-only would allow the compiler to discard the
       "out = 0" initialisation above.
       The "memory" clobber tells the compiler that the asm reads *b and
       reads/writes *a, which are not expressed through the operands. */
    asm("1: \n\t"
        "movl (%1), %%eax \n\t"
        "mull %4\n\t"           /* edx:eax = d * next digit of b */
        "addl (%0), %%eax \n\t" /* add next digit of a to eax */
        "adcl $0, %%edx \n\t"   /* accumulate carry in edx */
        "addl %3, %%eax \n\t"   /* add out to eax */
        "adcl $0, %%edx \n\t"   /* accumulate carry in edx */
        "movl %%eax, (%0) \n\t" /* eax is next digit of result */
        "movl %%edx, %3 \n\t"   /* edx is next out */
        "leal 4(%0), %0 \n\t"
        "leal 4(%1), %1 \n\t"
        "decl %2 \n\t"
        "jnz 1b"
        : "+&r" (a), "+&r" (b), "+&r" (blen), "+m" (out)
        : "m" (d)
        : "eax", "edx", "memory", "cc");
  }
  if (alen == 0) return out;
  /* current digit of a += out */
  BngAdd2(*a, carry, *a, out);
  a++;
  alen--;
  /* Propagate carry */
  if (carry == 0 || alen == 0) return carry;
  do {
    /* Incrementing a digit that does not wrap to 0 absorbs the carry. */
    if (++(*a) != 0) return 0;
    a++;
  } while (--alen);
  return 1;
}

/* In-place multiply-subtract: a[0..alen-1] -= d * b[0..blen-1].
   The asm loop handles the blen low digits, carrying a full digit ("out")
   from one position to the next.  If a has no further digits, out is
   returned as the borrow-out digit; otherwise out is subtracted from the
   next digit of a and the resulting one-bit borrow is propagated upwards,
   and that borrow (0 or 1) is returned.
   Assumes alen >= blen (alen - blen is used as a digit count) -- TODO
   confirm against callers. */
static bngdigit bng_ia32_mult_sub_digit
     (bng a/*[alen]*/, bngsize alen,
      bng b/*[blen]*/, bngsize blen,
      bngdigit d)
{
  bngdigit out, tmp;
  bngcarry carry;

  alen -= blen;
  out = 0;
  if (blen > 0) {
    /* blen ("decl %2") and out ("subl %3, %4") are both read before being
       written by the asm, so they must be read-write operands ("+m");
       declaring them write-only would allow the compiler to treat their
       incoming values as dead.  They stay in memory to leave enough
       registers for a, b, tmp and the eax/edx pair used by mull.
       The "memory" clobber tells the compiler that the asm reads *b and
       reads/writes *a, which are not expressed through the operands. */
    asm("1: \n\t"
        "movl (%1), %%eax \n\t"
        "movl (%0), %4 \n\t"
        "mull %5\n\t"           /* edx:eax = d * next digit of b */
        "subl %%eax, %4 \n\t"   /* subtract eax from next digit of a */
        "adcl $0, %%edx \n\t"   /* accumulate carry in edx */
        "subl %3, %4 \n\t"      /* subtract out */
        "adcl $0, %%edx \n\t"   /* accumulate carry in edx */
        "movl %4, (%0) \n\t"    /* store next digit of result */
        "movl %%edx, %3 \n\t"   /* edx is next out */
        "leal 4(%0), %0 \n\t"
        "leal 4(%1), %1 \n\t"
        "decl %2 \n\t"
        "jnz 1b"
        : "+&r" (a), "+&r" (b), "+m" (blen), "+m" (out), "=&r" (tmp)
        : "m" (d)
        : "eax", "edx", "memory", "cc");
  }
  if (alen == 0) return out;
  /* current digit of a -= out */
  BngSub2(*a, carry, *a, out);
  a++;
  alen--;
  /* Propagate carry */
  if (carry == 0 || alen == 0) return carry;
  do {
    /* Decrementing a digit that was non-zero absorbs the borrow. */
    if ((*a)-- != 0) return 0;
    a++;
  } while (--alen);
  return 1;
}

/* This is another asm implementation of some of the bng operations,
   using SSE2 operations to provide 64-bit arithmetic.
   This is faster than the plain IA32 code above on the Pentium 4.
   (Arithmetic operations with carry are slow on the Pentium 4). */

#if BNG_ASM_LEVEL >= 2

/* SSE2 variant of in-place addition: a[0..alen-1] += b[0..blen-1] + carry.
   Each digit is added in a 64-bit MMX register, so the carry is simply the
   upper 32 bits of the sum and no add-with-carry instruction is needed.
   The C tail propagates the final carry through the upper alen - blen
   digits of a.  Returns the carry out (0 or 1 when the incoming carry is
   0 or 1).
   Assumes alen >= blen (alen - blen is used as a digit count) -- TODO
   confirm against callers.
   NOTE(review): the MMX registers used by the asm are not declared as
   clobbers; this presumably relies on the compiler keeping no live
   x87/MMX state across the asm -- confirm. */
static bngcarry bng_ia32sse2_add
       (bng a/*[alen]*/, bngsize alen,
        bng b/*[blen]*/, bngsize blen,
        bngcarry carry)
{
  alen -= blen;
  if (blen > 0) {
    /* The "memory" clobber tells the compiler that the asm reads *b and
       reads/writes *a, which are not expressed through the operands.
       emms leaves MMX mode before returning to C code. */
    asm("movd %3, %%mm0 \n\t"       /* MM0 is carry */
        "1: \n\t"
        "movd (%0), %%mm1 \n\t"     /* MM1 is next digit of a */
        "movd (%1), %%mm2 \n\t"     /* MM2 is next digit of b */
        "paddq %%mm1, %%mm0 \n\t"   /* Add carry (64 bits) */
        "paddq %%mm2, %%mm0 \n\t"   /* Add digits (64 bits) */
        "movd %%mm0, (%0) \n\t"     /* Store low 32 bits of result */
        "psrlq $32, %%mm0 \n\t"     /* Next carry is top 32 bits of results */
        "addl $4, %0\n\t"
        "addl $4, %1\n\t"
        "subl $1, %2\n\t"
        "jne 1b \n\t"
        "movd %%mm0, %3 \n\t"
        "emms"
        : "+&r" (a), "+&r" (b), "+&r" (blen), "+&rm" (carry)
        : /* no inputs */
        : "memory", "cc");
  }
  /* Nothing left to propagate, or no digits left to propagate into. */
  if (carry == 0 || alen == 0) return carry;
  do {
    /* Incrementing a digit that does not wrap to 0 absorbs the carry. */
    if (++(*a) != 0) return 0;
    a++;
  } while (--alen);
  return 1;
}

/* SSE2 variant of in-place subtraction:
   a[0..alen-1] -= b[0..blen-1] + carry.
   Each digit is subtracted in a 64-bit MMX register; a borrow shows up as
   the sign bit (bit 63) of the 64-bit difference, so no
   subtract-with-borrow instruction is needed.  The C tail propagates the
   final borrow through the upper alen - blen digits of a.  Returns the
   borrow out (0 or 1).
   Assumes alen >= blen (alen - blen is used as a digit count) -- TODO
   confirm against callers.
   NOTE(review): the MMX registers used by the asm are not declared as
   clobbers; this presumably relies on the compiler keeping no live
   x87/MMX state across the asm -- confirm. */
static bngcarry bng_ia32sse2_sub
       (bng a/*[alen]*/, bngsize alen,
        bng b/*[blen]*/, bngsize blen,
        bngcarry carry)
{
  alen -= blen;
  if (blen > 0) {
    /* The "memory" clobber tells the compiler that the asm reads *b and
       reads/writes *a, which are not expressed through the operands.
       emms leaves MMX mode before returning to C code. */
    asm("movd %3, %%mm0 \n\t"       /* MM0 is carry */
        "1: \n\t"
        "movd (%0), %%mm1 \n\t"     /* MM1 is next digit of a */
        "movd (%1), %%mm2 \n\t"     /* MM2 is next digit of b */
        "psubq %%mm0, %%mm1 \n\t"   /* Subtract carry (64 bits) */
        "psubq %%mm2, %%mm1 \n\t"   /* Subtract digits (64 bits) */
        "movd %%mm1, (%0) \n\t"     /* Store low 32 bits of result */
        "psrlq $63, %%mm1 \n\t"     /* Next carry is sign bit of result */
        "movq %%mm1, %%mm0 \n\t"
        "addl $4, %0\n\t"
        "addl $4, %1\n\t"
        "subl $1, %2\n\t"
        "jne 1b \n\t"
        "movd %%mm0, %3 \n\t"
        "emms"
        : "+&r" (a), "+&r" (b), "+&r" (blen), "+&rm" (carry)
        : /* no inputs */
        : "memory", "cc");
  }
  /* Nothing left to propagate, or no digits left to propagate into. */
  if (carry == 0 || alen == 0) return carry;
  do {
    /* Decrementing a digit that was non-zero absorbs the borrow. */
    if ((*a)-- != 0) return 0;
    a++;
  } while (--alen);
  return 1;
}

/* SSE2 variant of in-place multiply-accumulate:
   a[0..alen-1] += d * b[0..blen-1].
   Each step computes digit of a + d * digit of b + carry in a 64-bit MMX
   register; the low 32 bits are stored back and the high 32 bits become
   the next carry digit.  If a has no further digits, the final carry
   digit is returned; otherwise it is added into the next digit of a and
   the resulting one-bit carry is propagated upwards, and that carry
   (0 or 1) is returned.
   Assumes alen >= blen (alen - blen is used as a digit count) -- TODO
   confirm against callers.
   NOTE(review): the MMX registers used by the asm are not declared as
   clobbers; this presumably relies on the compiler keeping no live
   x87/MMX state across the asm -- confirm. */
static bngdigit bng_ia32sse2_mult_add_digit
     (bng a/*[alen]*/, bngsize alen,
      bng b/*[blen]*/, bngsize blen,
      bngdigit d)
{
  bngdigit out;
  bngcarry carry;

  alen -= blen;
  out = 0;
  if (blen > 0) {
    /* out is only written by the asm (the running carry lives in MM0), so
       a write-only operand is correct here.
       The "memory" clobber tells the compiler that the asm reads *b and
       reads/writes *a, which are not expressed through the operands.
       emms leaves MMX mode before returning to C code. */
    asm("pxor %%mm0, %%mm0 \n\t"      /* MM0 is carry */
        "movd %4, %%mm7 \n\t"         /* MM7 is digit d */
        "1: \n\t"
        "movd (%0), %%mm1 \n\t"       /* MM1 is next digit of a */
        "movd (%1), %%mm2 \n\t"       /* MM2 is next digit of b */
        "pmuludq %%mm7, %%mm2 \n\t"   /* MM2 = d * digit of b */
        "paddq %%mm1, %%mm0 \n\t"     /* Add product and carry ... */
        "paddq %%mm2, %%mm0 \n\t"     /* ... and digit of a */
        "movd %%mm0, (%0) \n\t"       /* Store low 32 bits of result */
        "psrlq $32, %%mm0 \n\t"       /* Next carry is high 32 bits result */
        "addl $4, %0\n\t"
        "addl $4, %1\n\t"
        "subl $1, %2\n\t"
        "jne 1b \n\t"
        "movd %%mm0, %3 \n\t"
        "emms"
        : "+&r" (a), "+&r" (b), "+&r" (blen), "=&rm" (out)
        : "m" (d)
        : "memory", "cc");
  }
  if (alen == 0) return out;
  /* current digit of a += out */
  BngAdd2(*a, carry, *a, out);
  a++;
  alen--;
  /* Propagate carry */
  if (carry == 0 || alen == 0) return carry;
  do {
    /* Incrementing a digit that does not wrap to 0 absorbs the carry. */
    if (++(*a) != 0) return 0;
    a++;
  } while (--alen);
  return 1;
}

/* SSE2 variant of in-place multiply-subtract:
   a[0..alen-1] -= d * b[0..blen-1].
   The running borrow digit is kept in one's-complement form inside the
   loop (see the comments in the asm) so that every intermediate 64-bit
   value stays non-negative; it is decoded with ~ once the loop is done.
   If a has no further digits, the final borrow digit is returned;
   otherwise it is subtracted from the next digit of a and the resulting
   one-bit borrow is propagated upwards, and that borrow (0 or 1) is
   returned.
   Assumes alen >= blen (alen - blen is used as a digit count) -- TODO
   confirm against callers.
   NOTE(review): the MMX registers used by the asm are not declared as
   clobbers; this presumably relies on the compiler keeping no live
   x87/MMX state across the asm -- confirm. */
static bngdigit bng_ia32sse2_mult_sub_digit
     (bng a/*[alen]*/, bngsize alen,
      bng b/*[blen]*/, bngsize blen,
      bngdigit d)
{
  /* Read-only constants that the asm loads from memory. */
  static const unsigned long long bias1 = 0xFFFFFFFF00000000ULL - 0xFFFFFFFFULL;
  static const unsigned long bias2 = 0xFFFFFFFFUL;
  bngdigit out;
  bngcarry carry;

  alen -= blen;
  out = 0;
  if (blen > 0) {
    /* Carry C is represented by ENC(C) = 0xFFFFFFFF - C (one's complement) */
    /* The "memory" clobber tells the compiler that the asm reads *b and
       reads/writes *a, which are not expressed through the operands.
       emms leaves MMX mode before returning to C code. */
    asm("movd %6, %%mm0 \n\t"         /* MM0 is carry (initially 0xFFFFFFFF) */
        "movq %5, %%mm6 \n\t"         /* MM6 is magic constant bias1 */
        "movd %4, %%mm7 \n\t"         /* MM7 is digit d */
        "1: \n\t"
        "movd (%0), %%mm1 \n\t"       /* MM1 is next digit of a */
        "movd (%1), %%mm2 \n\t"       /* MM2 is next digit of b */
        "paddq %%mm6, %%mm1 \n\t"     /* bias digit of a */
        "pmuludq %%mm7, %%mm2 \n\t"   /* MM2 = d * digit of b */
        /* Compute
           digit of a + ENC(carry) + 0xFFFFFFFF00000000 - 0xFFFFFFFF - product
           = digit of a - carry + 0xFFFFFFFF00000000 - product
           = digit of a - carry - productlow + (ENC(nextcarry) << 32) */
        "psubq %%mm2, %%mm1 \n\t"
        "paddq %%mm1, %%mm0 \n\t"
        "movd %%mm0, (%0) \n\t"       /* Store low 32 bits of result */
        "psrlq $32, %%mm0 \n\t"       /* Next carry is 32 high bits of result */
        "addl $4, %0\n\t"
        "addl $4, %1\n\t"
        "subl $1, %2\n\t"
        "jne 1b \n\t"
        "movd %%mm0, %3 \n\t"
        "emms"
        : "+&r" (a), "+&r" (b), "+&r" (blen), "=&rm" (out)
        : "m" (d), "m" (bias1), "m" (bias2)
        : "memory", "cc");
    out = ~out; /* Undo encoding on out digit */
  }
  if (alen == 0) return out;
  /* current digit of a -= out */
  BngSub2(*a, carry, *a, out);
  a++;
  alen--;
  /* Propagate carry */
  if (carry == 0 || alen == 0) return carry;
  do {
    /* Decrementing a digit that was non-zero absorbs the borrow. */
    if ((*a)-- != 0) return 0;
    a++;
  } while (--alen);
  return 1;
}

/* Detect whether SSE2 instructions are supported */

static int bng_ia32_sse2_supported(void)
{
  unsigned int flags, newflags, max_id, capabilities;

#define EFLAG_CPUID 0x00200000
#define CPUID_IDENTIFY 0
#define CPUID_CAPABILITIES 1
#define SSE2_CAPABILITY 26

  /* Check if processor has CPUID instruction */
  asm("pushfl \n\t"
      "popl %0"
      : "=r" (flags) : );
  newflags = flags ^ EFLAG_CPUID;   /* CPUID detection flag */
  asm("pushfl \n\t"
      "pushl %1 \n\t"
      "popfl \n\t"
      "pushfl \n\t"
      "popl %0 \n\t"
      "popfl"
      : "=r" (flags) : "r" (newflags));
  /* If CPUID detection flag cannot be changed, CPUID instruction is not
     available */
  if ((flags & EFLAG_CPUID) != (newflags & EFLAG_CPUID)) return 0;
  /* See if SSE2 extensions are supported */
  asm("pushl %%ebx \n\t"        /* need to preserve %ebx for PIC */
      "cpuid \n\t"
      "popl %%ebx"
      : "=a" (max_id) : "a" (CPUID_IDENTIFY): "ecx", "edx");
  if (max_id < 1) return 0;
  asm("pushl %%ebx \n\t"
      "cpuid \n\t"
      "popl %%ebx"
      : "=d" (capabilities) : "a" (CPUID_CAPABILITIES) : "ecx");
  return capabilities & (1 << SSE2_CAPABILITY);
}

#endif

/* Install the IA32 assembly routines into the bng_ops dispatch table.
   When compiled with BNG_ASM_LEVEL >= 2 and the run-time SSE2 probe
   succeeds, the SSE2 variants are installed; in every other case the
   plain IA32 variants are. */
static void bng_ia32_setup_ops(void)
{
#if BNG_ASM_LEVEL >= 2
  if (bng_ia32_sse2_supported() != 0) {
    /* SSE2-based routines */
    bng_ops.mult_sub_digit = bng_ia32sse2_mult_sub_digit;
    bng_ops.mult_add_digit = bng_ia32sse2_mult_add_digit;
    bng_ops.sub = bng_ia32sse2_sub;
    bng_ops.add = bng_ia32sse2_add;
  } else
#endif
  {
    /* Plain IA32 routines */
    bng_ops.mult_sub_digit = bng_ia32_mult_sub_digit;
    bng_ops.mult_add_digit = bng_ia32_mult_add_digit;
    bng_ops.sub = bng_ia32_sub;
    bng_ops.add = bng_ia32_add;
  }
}

/* Expands to a call to bng_ia32_setup_ops().  Presumably the generic bng
   code evaluates this macro as its architecture-specific initialisation
   hook -- confirm at the point of use. */
#define BNG_SETUP_OPS bng_ia32_setup_ops()