# Copyright Digital Equipment Corporation & INRIA 1988, 1989
# Last modified_on Tue Jul 31 17:48:45 GMT+2:00 1990 by shand
#      modified_on Fri Mar 2 16:53:50 GMT+1:00 1990 by herve
#
# KerN for Mips
# Paul Zimmermann & Robert Ehrlich & Bernard Paul Serpette
# & Mark Shand
#
.text
.align 2
.globl BnnSetToZero
.ent BnnSetToZero                       # BnnSetToZero(nn, nl)
#
# Clear the nl words that start at nn.
#   $4 = nn (word pointer), $5 = nl (word count); no value is returned.
# The stores are unrolled eight to a pass.  The first pass is entered part
# way through, at the entry that BSTZTable gives for (nl mod 8), so that it
# clears exactly the leftover words; every later pass clears eight words.
# Registers written: $4 (advanced to the end), $8, $9, $10.
#
BnnSetToZero:
.frame $sp, 0, $31
        sll     $9, $5, 2               # $9 = nl * 4 (length in bytes)
        beq     $5, $0, BSTZ2           # nl == 0: nothing to clear
        andi    $8, $9, 0x1c            # $8 = (nl mod 8) * 4
        lw      $10, BSTZTable($8)      # $10 = entry point of the first pass
        addu    $9, $9, $4              # $9 = nn + nl*4, one past the last word
        addu    $4, $4, $8              # step nn over the leftover words
        j       $10                     # jump into the unrolled stores
BSTZE8:
BSTZLoop:
        addu    $4, $4, 32              # nn += 8 words
        sw      $0, -32($4)             # nn[-8] = 0
BSTZE7: sw      $0, -28($4)             # nn[-7] = 0
BSTZE6: sw      $0, -24($4)             # nn[-6] = 0
BSTZE5: sw      $0, -20($4)             # nn[-5] = 0
BSTZE4: sw      $0, -16($4)             # nn[-4] = 0
BSTZE3: sw      $0, -12($4)             # nn[-3] = 0
BSTZE2: sw      $0, -8($4)              # nn[-2] = 0
BSTZE1: sw      $0, -4($4)              # nn[-1] = 0
        bne     $4, $9, BSTZLoop        # another pass until nn reaches the end
BSTZ2:  j       $31                     # return
.rdata
BSTZTable:                              # indexed by (nl mod 8) * 4
        .word   BSTZE8                  # remainder 0: a full pass of eight
        .word   BSTZE1
        .word   BSTZE2
        .word   BSTZE3
        .word   BSTZE4
        .word   BSTZE5
        .word   BSTZE6
        .word   BSTZE7
.text
.end BnnSetToZero
|
|
|
|
.align 2
.globl BnnAssign
.ent BnnAssign                          # BnnAssign(mm, nn, nl)
#
# Copy nl words from nn to mm.
#   $4 = mm (destination), $5 = nn (source), $6 = nl (word count).
# When mm <= nn the copy runs upward from the first word; otherwise both
# pointers are moved to the end and the copy runs downward from the last
# word.  Either way a source word is read before the copy can overwrite
# it, so the two areas may overlap.
# Registers written: $4, $5, $6, $12.  No value is returned.
#
BnnAssign:
.frame $sp, 0, $31
        bleu    $4, $5, BAG2            # addresses compare unsigned: mm <= nn, copy upward
        sll     $12, $6, 2              # X = nl * 4
        addu    $4, $4, $12             # mm = &mm[nl]
        addu    $5, $5, $12             # nn = &nn[nl]
        b       BAG4                    # copy downward
BAG1:   lw      $12, 0($5)              # X = *nn
        sw      $12, 0($4)              # *mm = X
        addu    $4, $4, 4               # mm++
        addu    $5, $5, 4               # nn++
        subu    $6, $6, 1               # nl--
BAG2:   bnez    $6, BAG1                # while (nl) keep copying upward
        j       $31                     # return
BAG3:   subu    $4, $4, 4               # mm--
        subu    $5, $5, 4               # nn--
        lw      $12, 0($5)              # X = *nn
        sw      $12, 0($4)              # *mm = X
        subu    $6, $6, 1               # nl--
BAG4:   bnez    $6, BAG3                # while (nl) keep copying downward
        j       $31                     # return
.end BnnAssign
|
|
|
|
.align 2
.globl BnnSetDigit
.ent BnnSetDigit                        # BnnSetDigit(nn, d)
#
# Store the digit d into the word addressed by nn.
#   $4 = nn, $5 = d.  No value is returned and no register is written.
#
BnnSetDigit:
        sw      $5, 0($4)               # *nn = d
        j       $31                     # return
.end BnnSetDigit
|
|
|
|
.align 2
.globl BnnGetDigit
.ent BnnGetDigit                        # BnnGetDigit(nn)
#
# Return the word addressed by nn.
#   $4 = nn; result in $2.
#
BnnGetDigit:
        lw      $2, 0($4)               # result = *nn
        j       $31                     # return
.end BnnGetDigit
|
|
|
|
.align 2
.globl BnnNumDigits
.ent BnnNumDigits                       # BnnNumDigits(nn, nl)
#
# Return the number of significant words in the nl-word number at nn.
#   $4 = nn, $5 = nl; result in $2.
# The words are scanned from the top down.  The result is the index of the
# highest non-zero word plus one, or 1 when no non-zero word is found
# (which includes nl == 0).
# Registers written: $2, $4, $5, $12.
#
BnnNumDigits:
.frame $sp, 0, $31
        sll     $12, $5, 2              # $12 = nl * 4
        addu    $4, $4, $12             # nn = &nn[nl]
        b       BND2                    # test nl before the first load
BND1:   subu    $5, $5, 1               # nl--
        subu    $4, $4, 4               # nn--
        lw      $12, 0($4)              # X = *nn
        bnez    $12, BND3               # found the highest non-zero word
BND2:   bnez    $5, BND1                # more words to look at
        li      $2, 1                   # all words were zero: return 1
        j       $31
BND3:   addu    $2, $5, 1               # return index + 1 (nl was already decremented)
        j       $31
.end BnnNumDigits
|
|
|
|
.align 2
.globl BnnNumLeadingZeroBitsInDigit
.ent BnnNumLeadingZeroBitsInDigit       # BnnNumLeadingZeroBitsInDigit(d)
#
# Return how many zero bits lie above the highest set bit of d,
# or 32 when d is zero.
#   $4 = d; result in $2.
# Only $2 and $4 are written; BnnDivideDigit in this file calls this
# routine while holding live values in other temporaries.
#
BnnNumLeadingZeroBitsInDigit:
.frame $sp, 0, $31
        li      $2, 0                   # p = 0
        bnez    $4, BLZ2                # d != 0: go count
        li      $2, 32                  # d == 0: return 32
        j       $31
BLZ1:   addu    $2, $2, 1               # p++
        sll     $4, $4, 1               # d <<= 1
BLZ2:   bgtz    $4, BLZ1                # loop while the top bit of d is still clear
        j       $31                     # return p
.end BnnNumLeadingZeroBitsInDigit
|
|
|
|
.align 2
.globl BnnDoesDigitFitInWord
.ent BnnDoesDigitFitInWord              # BnnDoesDigitFitInWord(d)
#
# Always return 1; the argument in $4 is not examined.
#
BnnDoesDigitFitInWord:
.frame $sp, 0, $31
        li      $2, 1                   # result = 1
        j       $31                     # return
.end BnnDoesDigitFitInWord
|
|
|
|
.align 2
.globl BnnIsDigitZero
.ent BnnIsDigitZero                     # BnnIsDigitZero(d)
#
# Return 1 when d is zero and 0 otherwise.
#   $4 = d; result in $2.
#
BnnIsDigitZero:
.frame $sp, 0, $31
        sltu    $2, $4, 1               # unsigned d < 1 holds exactly when d == 0
        j       $31                     # return
.end BnnIsDigitZero
|
|
|
|
.align 2
.globl BnnIsDigitNormalized
.ent BnnIsDigitNormalized               # BnnIsDigitNormalized(d)
#
# Return 1 when the most significant bit of d is set and 0 otherwise.
#   $4 = d; result in $2.
#
BnnIsDigitNormalized:
.frame $sp, 0, $31
        srl     $2, $4, 31              # bit 31 of d, the same value as the signed test (d < 0)
        j       $31                     # return
.end BnnIsDigitNormalized
|
|
|
|
.align 2
.globl BnnIsDigitOdd
.ent BnnIsDigitOdd                      # BnnIsDigitOdd(d)
#
# Return the least significant bit of d (1 for odd, 0 for even).
#   $4 = d; result in $2.
#
BnnIsDigitOdd:
.frame $sp, 0, $31
        andi    $2, $4, 1               # result = d & 1
        j       $31                     # return
.end BnnIsDigitOdd
|
|
|
|
.align 2
.globl BnnCompareDigits
.ent BnnCompareDigits                   # BnnCompareDigits(d1, d2)
#
# Unsigned three-way comparison of two digits.
#   $4 = d1, $5 = d2; result in $2: 1 when d1 > d2, 0 when equal,
#   -1 when d1 < d2.  C equivalent: return ((d1 > d2) - (d1 < d2));
# Registers written: $2, $8, $9.
#
BnnCompareDigits:
.frame $sp, 0, $31
        sltu    $8, $5, $4              # t0 = (d2 < d1)
        sltu    $9, $4, $5              # t1 = (d1 < d2)
        subu    $2, $8, $9              # t0 - t1; both are 0 or 1 so nothing can overflow
        j       $31                     # return
.end BnnCompareDigits
|
|
|
|
.align 2
.globl BnnComplement
.ent BnnComplement                      # BnnComplement(nn, nl)
#
# Replace each of the nl words at nn by its bitwise complement.
#   $4 = nn, $5 = nl.  No value is returned.
# Registers written: $4 (advanced to the end), $8, $14.
#
BnnComplement:
.frame $sp, 0, $31
        sll     $8, $5, 2               # bytes = nl * 4
        beq     $5, $0, BCM2            # nl == 0: nothing to do
        addu    $8, $8, $4              # lim = nn + bytes (addu: address sums must not trap)
BCM1:   lw      $14, 0($4)              # X = *nn
        nor     $14, $14, $0            # X = ~X
        sw      $14, 0($4)              # *nn = X
        addu    $4, $4, 4               # nn++
        bne     $8, $4, BCM1            # loop until nn reaches lim
BCM2:   j       $31                     # return
.end BnnComplement
|
|
|
|
.align 2
.globl BnnAndDigits
.ent BnnAndDigits                       # BnnAndDigits(nn, d)
#
# AND the digit d into the word addressed by nn.
#   $4 = nn, $5 = d.  No value is returned.  Register written: $14.
#
BnnAndDigits:
.frame $sp, 0, $31
        lw      $14, 0($4)              # X = *nn
        and     $14, $14, $5            # X &= d
        sw      $14, 0($4)              # *nn = X
        j       $31                     # return
.end BnnAndDigits
|
|
|
|
.align 2
.globl BnnOrDigits
.ent BnnOrDigits                        # BnnOrDigits(nn, d)
#
# OR the digit d into the word addressed by nn.
#   $4 = nn, $5 = d.  No value is returned.  Register written: $14.
#
BnnOrDigits:
.frame $sp, 0, $31
        lw      $14, 0($4)              # X = *nn
        or      $14, $14, $5            # X |= d
        sw      $14, 0($4)              # *nn = X
        j       $31                     # return
.end BnnOrDigits
|
|
|
|
.align 2
.globl BnnXorDigits
.ent BnnXorDigits                       # BnnXorDigits(nn, d)
#
# XOR the digit d into the word addressed by nn.
#   $4 = nn, $5 = d.  No value is returned.  Register written: $14.
#
BnnXorDigits:
.frame $sp, 0, $31
        lw      $14, 0($4)              # X = *nn
        xor     $14, $14, $5            # X ^= d
        sw      $14, 0($4)              # *nn = X
        j       $31                     # return
.end BnnXorDigits
|
|
|
|
.align 2
.globl BnnShiftLeft
.ent BnnShiftLeft                       # BnnShiftLeft(mm, ml, nbi)
#
# Shift the ml-word number at mm left by nbi bits, in place, working from
# the lowest word upward.
#   $4 = mm, $5 = ml, $6 = nbi; result in $2 = the bits shifted out of the
#   top word (0 when nbi == 0 or ml == 0, in which case memory is untouched).
# Presumably 0 <= nbi < 32 -- confirm against the callers.
# Registers written: $2, $4, $14, $15, $24, $25.  $5, $6 and $7 are left
# intact, which BnnDivideDigit in this file relies on.
#
BnnShiftLeft:
.frame $sp, 0, $31
        li      $2, 0                   # res = 0
        beq     $6, $0, BSL2            # nbi == 0: nothing to shift
        li      $14, 32
        subu    $14, $14, $6            # rnbi = 32 - nbi
        beq     $5, $0, BSL2            # ml == 0: nothing to shift
        sll     $15, $5, 2              # bytes = ml * 4
        addu    $15, $15, $4            # lim = mm + bytes
BSL1:   lw      $25, 0($4)              # save = *mm
        sllv    $24, $25, $6            # X = save << nbi
        or      $24, $24, $2            # X |= res (bits carried up from the word below)
        sw      $24, 0($4)              # *mm = X
        addu    $4, $4, 4               # mm++
        srlv    $2, $25, $14            # res = save >> rnbi
        bne     $4, $15, BSL1           # loop until mm reaches lim
BSL2:   j       $31                     # return res
.end BnnShiftLeft
|
|
|
|
.align 2
.globl BnnShiftRight
.ent BnnShiftRight                      # BnnShiftRight(mm, ml, nbi)
#
# Shift the ml-word number at mm right by nbi bits, in place, working from
# the top word downward.
#   $4 = mm, $5 = ml, $6 = nbi; result in $2 = the bits shifted out of the
#   lowest word, left-aligned (0 when nbi == 0 or ml == 0, in which case
#   memory is untouched).
# Presumably 0 <= nbi < 32 -- confirm against the callers.
# Registers written: $2, $14, $15, $24, $25.  $4, $5 and $6 are left intact.
#
BnnShiftRight:
.frame $sp, 0, $31
        li      $2, 0                   # res = 0
        beq     $6, $0, BSR2            # nbi == 0: nothing to shift
        sll     $14, $5, 2              # bytes = ml * 4
        beq     $5, $0, BSR2            # ml == 0: nothing to shift
        addu    $15, $4, $14            # p = mm + bytes; $4 stays as the lower limit
        li      $14, 32
        subu    $14, $14, $6            # lnbi = 32 - nbi
BSR1:   subu    $15, $15, 4             # p--
        lw      $25, 0($15)             # save = *p
        srlv    $24, $25, $6            # X = save >> nbi
        or      $24, $24, $2            # X |= res (bits carried down from the word above)
        sw      $24, 0($15)             # *p = X
        sllv    $2, $25, $14            # res = save << lnbi
        bne     $15, $4, BSR1           # loop until p is back at mm
BSR2:   j       $31                     # return res
.end BnnShiftRight
|
|
|
|
.align 2
.globl BnnAddCarry
.ent BnnAddCarry                        # BnnAddCarry(nn, nl, car)
#
# Add the carry car into the nl-word number at nn.
#   $4 = nn, $5 = nl, $6 = car; result in $2.
# Returns 0 when car is zero or when the carry is absorbed by some word,
# and 1 when it runs off the top (which includes nl == 0 with car set).
# Any non-zero car is treated as a carry of one.
# Registers written: $2, $4, $5, $9.
#
BnnAddCarry:
.frame $sp, 0, $31
        beq     $6, $0, BAC3            # no carry in: return 0
        beq     $5, $0, BAC2            # no words: the carry comes straight out
BAC1:   subu    $5, $5, 1               # nl--
        lw      $9, 0($4)               # X = *nn
        addu    $9, $9, 1               # X++
        sw      $9, 0($4)               # *nn = X
        addu    $4, $4, 4               # nn++
        bne     $9, $0, BAC3            # no wrap-around: carry absorbed
        bne     $5, $0, BAC1            # wrapped to zero: carry into the next word
BAC2:   li      $2, 1                   # return 1
        j       $31
BAC3:   li      $2, 0                   # return 0
        j       $31
.end BnnAddCarry
|
|
|
|
.align 2
.globl BnnAdd
.ent BnnAdd                             # BnnAdd(mm, ml, nn, nl, car)
#
# Add the nl-word number at nn and the carry car into the ml-word number
# at mm, in place, and return the carry out.
#   $4 = mm, $5 = ml, $6 = nn, $7 = nl, car = fifth argument at 16($sp);
#   result in $2.
# The first nl words are added pairwise; the remaining ml - nl words of mm
# only receive the carry.
# Registers written: $2, $4, $5, $6, $7, $9, $10, $15.
# BnnMultiplyDigit in this file branches here for d == 1.
#
BnnAdd:
.frame $sp, 0, $31
        lw      $2, 16($sp)             # c = car
        subu    $5, $5, $7              # ml -= nl (words left for carry propagation)
        bne     $7, $0, BADD1           # nl != 0: run the word-by-word addition
        bne     $2, $0, BADD2           # nl == 0 with a carry: propagate it
BADD0:  j       $31                     # return c
# Here save = *nn + c came out as zero, so either *nn == 0 with c == 0 or
# *nn == -1 with c == 1.  Both ways *mm is unchanged and c is preserved.
BADD1a: addu    $4, $4, 4               # mm++
        beq     $7, $0, BADD2           # nn exhausted
BADD1:  subu    $7, $7, 1               # nl--
        lw      $15, 0($6)              # save = *nn
        addu    $6, $6, 4               # nn++
        addu    $15, $15, $2            # save += c
        beq     $15, $0, BADD1a         # zero addend: skip the store
        lw      $10, 0($4)              # X = *mm
        addu    $4, $4, 4               # mm++
        addu    $10, $10, $15           # X += save
        sw      $10, -4($4)             # mm[-1] = X
        sltu    $2, $10, $15            # c = (X < save), the carry out of this word
        bne     $7, $0, BADD1           # more words of nn
BADD2:  beq     $5, $0, BADD0           # no words left in mm: return c
        beq     $2, $0, BADD0           # no carry to propagate: return 0
BADD3:  subu    $5, $5, 1               # ml--
        lw      $9, 0($4)               # X = *mm
        addu    $9, $9, 1               # X++
        sw      $9, 0($4)               # *mm = X
        addu    $4, $4, 4               # mm++
        bne     $9, $0, BADD4           # carry absorbed
        bne     $5, $0, BADD3           # wrapped to zero: next word
        j       $31                     # carry ran off the top; $2 still holds 1
BADD4:  move    $2, $0                  # return 0
        j       $31
.end BnnAdd
|
|
|
|
.align 2
.globl BnnSubtractBorrow
.ent BnnSubtractBorrow                  # BnnSubtractBorrow(nn, nl, car)
#
# Propagate a borrow through the nl-word number at nn.  The borrow is
# taken when car == 0; the result uses the same convention.
#   $4 = nn, $5 = nl, $6 = car; result in $2.
# Returns 1 when car is non-zero or when some word absorbs the borrow,
# and 0 when the borrow runs off the top (which includes nl == 0).
# Registers written: $2, $4, $5, $9, $10.
# BnnSubtract in this file branches to BSB1 with $4 and $5 set up, so the
# label and its register use must not change.
#
BnnSubtractBorrow:
.frame $sp, 0, $31
        bne     $6, $0, BSB3            # car set: nothing to borrow, return 1
        beq     $5, $0, BSB2            # no words: the borrow comes straight out
BSB1:   subu    $5, $5, 1               # nl--
        lw      $9, 0($4)               # X = *nn
        subu    $10, $9, 1              # Y = X - 1
        sw      $10, 0($4)              # *nn = Y
        addu    $4, $4, 4               # nn++
        bne     $9, $0, BSB3            # X was non-zero: borrow absorbed
        bne     $5, $0, BSB1            # X was zero: borrow from the next word
BSB2:   li      $2, 0                   # return 0
        j       $31
BSB3:   li      $2, 1                   # return 1
        j       $31
.end BnnSubtractBorrow
|
|
|
|
.align 2
.globl BnnSubtract
.ent BnnSubtract 2                      # BnnSubtract(mm, ml, nn, nl, car)
#
# Subtract the nl-word number at nn from the ml-word number at mm, in
# place.  A borrow is pending when car == 0, and the result follows the
# same convention (0 means a borrow came out of the top).
#   $4 = mm, $5 = ml, $6 = nn, $7 = nl, car = fifth argument at 16($sp);
#   result in $2.
# Internally c = car ^ 1 is the pending borrow; this presumably expects
# car to be 0 or 1 -- confirm against the callers.
# The tail (propagating a borrow through the remaining ml - nl words)
# is done by branching to BSB1 inside BnnSubtractBorrow.
# Registers written: $2, $4, $5, $6, $7, $10, $12, $14, $15, plus those
# written by the BSB1 loop.
#
BnnSubtract:
.frame $sp, 0, $31
        subu    $5, $5, $7              # ml -= nl
        lw      $2, 16($sp)             # car
        xor     $14, $2, 1              # c = car ^ 1 (pending borrow)
        bne     $7, $0, BS1             # nl != 0: run the word-by-word subtraction
        bne     $2, $0, BS0             # nl == 0 and no borrow pending: return car
        bne     $5, $0, BSB1            # borrow pending and words left: propagate it
BS0:    j       $31                     # return car (still in $2)
# Here sub = *nn + c came out as zero, so either *nn == 0 with c == 0 or
# *nn == -1 with c == 1.  Both ways *mm is unchanged and c is preserved.
BS1a:   addu    $4, $4, 4               # mm++
        beq     $7, $0, BS2             # nn exhausted
BS1:    subu    $7, $7, 1               # nl--
        lw      $12, 0($6)              # sub = *nn
        addu    $6, $6, 4               # nn++
        addu    $12, $12, $14           # sub += c
        beq     $12, $0, BS1a           # zero subtrahend: skip the store
        lw      $15, 0($4)              # X = *mm
        addu    $4, $4, 4               # mm++
        subu    $10, $15, $12           # Y = X - sub (sub != 0 here)
        sw      $10, -4($4)             # mm[-1] = Y
        sltu    $14, $15, $10           # c = (X < Y): the subtraction wrapped
        bne     $7, $0, BS1             # more words of nn
BS2:    beq     $14, $0, BS3            # no borrow pending: return 1
        bne     $5, $0, BSB1            # borrow pending and words left: propagate it
BS3:    xor     $2, $14, 1              # return c ^ 1
        j       $31
.end BnnSubtract
|
|
|
|
.align 2
.globl BnnMultiplyDigit
.ent BnnMultiplyDigit                   # BnnMultiplyDigit(pp, pl, mm, ml, d)
#
# Add mm * d into the pl-word number at pp and return the carry out.
#   $4 = pp, $5 = pl, $6 = mm, $7 = ml, d = fifth argument at 16($sp);
#   result in $2 (0 or 1).
# Special cases: d == 0 returns 0 at once; d == 1 stores 0 as the fifth
# argument and branches to BnnAdd.
# Registers written on the general path: $2, $4, $5, $6, $7, $8 to $13, $15.
#
BnnMultiplyDigit:
.frame $sp, 0, $31
        lw      $8, 16($sp)             # d
        move    $9, $0                  # low = 0
        li      $2, 1                   # constant 1 for the d == 1 test
        beq     $8, $0, BMD7            # d == 0: return 0
        move    $10, $0                 # carry1 = 0
        bne     $8, $2, BMDFastLinkage  # d != 1: general multiply
        sw      $0, 16($sp)             # d == 1: becomes BnnAdd(pp, pl, mm, ml, 0)
        b       BnnAdd

# FastLinkage entry point takes 5th parameter in r8
# and two extra parameters in r9,r10 which must add to
# less than 2^32 and that are added to pp[0]
# used from BnnMultiply squaring code.
BMDFastLinkage:
        subu    $5, $5, $7              # pl -= ml
        move    $11, $0                 # inc = 0
# Register $15 (save) is logically zero here but is left unset: with
# inc == 0 the first (save < inc) test yields 0 whatever $15 holds.
        beq     $7, $0, BMD6            # ml == 0: only the final word is updated
        sll     $7, $7, 2               # ml *= 4
        addu    $7, $7, $6              # $7 = &mm[ml], the loop limit
BMD3:   lw      $13, 0($6)              # X = *mm
        addu    $6, $6, 4               # mm++
        multu   $13, $8                 # HI:LO = X * d
        sltu    $12, $15, $11           # carry2 = (save < inc), from the previous word
        lw      $15, 0($4)              # save = *pp
        addu    $9, $9, $10             # low += carry1
        addu    $9, $9, $12             # low += carry2
        addu    $15, $15, $9            # save += low
        sltu    $10, $15, $9            # carry1 = (save < low)
        addu    $4, $4, 4               # pp++
        mflo    $11                     # inc = LO
        mfhi    $9                      # low = HI
        addu    $15, $15, $11           # save += inc
        sw      $15, -4($4)             # pp[-1] = save
        bne     $7, $6, BMD3            # loop until mm reaches the limit
BMD6:   sltu    $12, $15, $11           # carry2 = (save < inc)
        lw      $15, 0($4)              # save = *pp
        addu    $9, $9, $10             # low += carry1
        addu    $9, $9, $12             # low += carry2
        addu    $9, $9, $15             # low += save
        sw      $9, 0($4)               # *pp = low
        addu    $4, $4, 4               # pp++
        bltu    $9, $15, BMD8           # the sum wrapped: propagate a carry
BMD7:   move    $2, $0                  # return 0
        j       $31
BMD8:   subu    $5, $5, 1               # pl-- (for the word just written)
        beq     $5, $0, BMD10           # pl == 0: the carry comes out, return 1
BMD9:   subu    $5, $5, 1               # pl--
        lw      $9, 0($4)               # X = *pp
        addu    $9, $9, 1               # X++
        sw      $9, 0($4)               # *pp = X
        addu    $4, $4, 4               # pp++
        bne     $9, $0, BMD7            # carry absorbed: return 0
        bne     $5, $0, BMD9            # wrapped to zero: next word
BMD10:  li      $2, 1                   # return 1
        j       $31
.end BnnMultiplyDigit
|
|
|
|
.align 2
.globl BnnDivideDigit
.ent BnnDivideDigit                     # BnnDivideDigit(qq, nn, nl, d)
#
# Divide the nl-word number at nn by the digit d, storing nl - 1 quotient
# words at qq, and return the remainder.
#   $4 = qq, $5 = nn, $6 = nl, $7 = d; result in $2.
# The divisor is first normalised: k = number of leading zero bits of d,
# nn and d are shifted left by k, the division runs on 16-bit halves of d
# (ch = high half, cl = low half), and afterwards nn is shifted back right
# by k and the remainder is returned as rl >> k.
#
# Linkage notes (all visible in the code below):
#   - the return address is moved to $11 and the routine returns with
#     "j $11"; $31 is then reused to hold first_qq across the division;
#   - $6 to $13 hold live values across the jal instructions, so the
#     called routines (BnnNumLeadingZeroBitsInDigit, BnnShiftLeft,
#     BnnShiftRight) must keep leaving those registers intact.
# Register roles: $10 = qq, $9 = nn, $8 = nl, $7 = d, $6 = k, $5 = o_nl,
#   $25 = ch, $2 = cl, $12 = rh, $13 = rl, $14 = qa, $15 = ph, $24 = pl,
#   $3 = X (scratch).
#
BnnDivideDigit:
.frame $sp, 0, $31
        move    $11, $31                # keep the return address in $11
        move    $10, $4                 # qq
        move    $9, $5                  # nn
        move    $8, $6                  # nl
        move    $4, $7                  # k = BnnNumLeadingZeroBitsInDigit(d);
        jal     BnnNumLeadingZeroBitsInDigit
        move    $6, $2                  # k
        beq     $6, $0, BDD1            # k == 0: d is already normalised
        move    $4, $9
        move    $5, $8
        jal     BnnShiftLeft            # BnnShiftLeft(nn, nl, k);
        lw      $31, 0($10)             # first_qq = *qq
        move    $5, $8                  # o_nl = nl
        sll     $7, $7, $6              # d <<= k
BDD1:   sll     $3, $8, 2               # X = nl * 4
        addu    $9, $9, $3              # nn = &nn[nl]
        subu    $8, $8, 1               # nl--
        subu    $3, $3, 4
        addu    $10, $10, $3            # qq = &qq[nl]
        srl     $25, $7, 16             # ch = HIGH(d)
        and     $2, $7, 65535           # cl = LOW(d)
        subu    $9, $9, 4               # nn--
        lw      $13, 0($9)              # rl = *nn
        beq     $8, $0, BDDx            # nl == 0: no quotient words to produce
# One pass of the loop produces one quotient word in two 16-bit steps.
BDD2:   subu    $8, $8, 1               # nl--
        move    $12, $13                # rh = rl
        subu    $9, $9, 4               # nn--
        lw      $13, 0($9)              # rl = *nn
# First step: estimate the high half of the quotient word.
        divu    $14, $12, $25           # qa = rh / ch
        multu   $2, $14                 # HI:LO = cl * qa
        mflo    $24                     # pl = LO
        multu   $25, $14                # HI:LO = ch * qa
        mflo    $15                     # ph = LO
        srl     $3, $24, 16             # X = HIGH(pl)
        addu    $15, $15, $3            # ph += X
        sll     $24, $24, 16            # pl = L2H(pl)
        bgtu    $15, $12, BDD84         # ph > rh: the estimate is too large
        bne     $15, $12, BDD88         # ph < rh: the estimate is fine
        bleu    $24, $13, BDD88         # ph == rh and pl <= rl: fine
BDD84:  sll     $3, $2, 16              # X = L2H(cl)
BDD85:  subu    $14, $14, 1             # qa--
        bleu    $3, $24, BDD86          # X <= pl: no borrow out of pl
        subu    $15, $15, 1             # ph-- (borrow)
BDD86:  subu    $24, $24, $3            # pl -= X
        subu    $15, $15, $25           # ph -= ch
        bgtu    $15, $12, BDD85         # ph > rh: still too large
        bne     $15, $12, BDD88         # ph != rh: done correcting
        bgtu    $24, $13, BDD85         # ph == rh and pl > rl: still too large
BDD88:  bleu    $24, $13, BDD89         # pl <= rl: no borrow out of rl
        subu    $12, $12, 1             # rh-- (borrow)
BDD89:  subu    $13, $13, $24           # rl -= pl
        subu    $12, $12, $15           # rh -= ph
        subu    $10, $10, 4             # qq--
        sll     $3, $14, 16             # X = L2H(qa)
        sw      $3, 0($10)              # *qq = X (high half of the quotient word)
# Second step: estimate the low half of the quotient word.
        sll     $3, $12, 16             # X = L2H(rh)
        srl     $14, $13, 16            # qa = HIGH(rl)
        or      $14, $14, $3            # qa |= X
        divu    $14, $14, $25           # qa /= ch
        multu   $2, $14                 # HI:LO = cl * qa
        mflo    $24                     # pl = LO
        multu   $25, $14                # HI:LO = ch * qa
        mflo    $15                     # ph = LO
        srl     $3, $24, 16             # X = HIGH(pl)
        addu    $15, $15, $3            # ph += X
        and     $24, $24, 65535         # pl = LOW(pl)
        and     $3, $15, 65535          # X = LOW(ph)
        sll     $3, $3, 16              # X = L2H(X)
        or      $24, $24, $3            # pl |= X
        srl     $15, $15, 16            # ph = HIGH(ph)
        bgtu    $15, $12, BDD41         # ph > rh: the estimate is too large
        bne     $15, $12, BDD44         # ph < rh: the estimate is fine
        bleu    $24, $13, BDD44         # ph == rh and pl <= rl: fine
BDD41:  subu    $14, $14, 1             # qa--
        bleu    $7, $24, BDD42          # d <= pl: no borrow out of pl
        subu    $15, $15, 1             # ph-- (borrow)
BDD42:  subu    $24, $24, $7            # pl -= d
        bgtu    $15, $12, BDD41         # ph > rh: still too large
        bne     $15, $12, BDD44         # ph != rh: done correcting
        bgtu    $24, $13, BDD41         # ph == rh and pl > rl: still too large
BDD44:  subu    $13, $13, $24           # rl -= pl
        lw      $3, 0($10)              # X = *qq
        or      $3, $3, $14             # X |= qa (low half of the quotient word)
        sw      $3, 0($10)              # *qq = X
        bne     $8, $0, BDD2            # more quotient words to produce
# Undo the normalisation of nn.  The three cases differ in how qq
# lies relative to nn.
BDDx:   beq     $6, $0, BDD46           # k == 0: nothing was shifted
        bleu    $10, $9, BDD45          # qq <= nn
        sll     $3, $5, 2
        addu    $3, $3, $9              # X = &nn[o_nl]
        bleu    $3, $10, BDD45          # &nn[o_nl] <= qq
# Here nn < qq < &nn[o_nl]: shift only nn[0 .. qq - nn], with *qq
# temporarily replaced by the value saved in first_qq.
        subu    $5, $10, $9             # o_nl = qq - nn (bytes)
        srl     $5, $5, 2               # o_nl >>= 2 (words)
        lw      $8, 0($10)              # X = *qq
        sw      $31, 0($10)             # *qq = first_qq
        addu    $5, $5, 1               # o_nl++
        move    $4, $9                  # BnnShiftRight(nn, o_nl, k);
        jal     BnnShiftRight
        sw      $8, 0($10)              # *qq = X (put the quotient word back)
        srl     $2, $13, $6             # return rl >> k
        j       $11
BDD45:  bne     $10, $9, BDD451         # qq != nn: shift all o_nl words
        subu    $5, $5, 1               # qq == nn: o_nl--
        sll     $5, $5, 2
        addu    $9, $9, $5              # nn = &nn[o_nl]
        li      $5, 1                   # o_nl = 1 (shift the top word only)
BDD451: move    $4, $9                  # BnnShiftRight(nn, o_nl, k);
        jal     BnnShiftRight
BDD46:  srl     $2, $13, $6             # return rl >> k
        j       $11
.end BnnDivideDigit
|
|
|
|
#############################################################################
|
|
# Karatsuba Multiplication for Mips.
|
|
# Mark Shand & Jean Vuillemin, May 1989.
|
|
#
|
|
# Basic operation is to compute: (a1.B + a0) * (b1.B + b0)
|
|
# B is the base; a1,a0,b1,b0 <= B-1
|
|
# We compute PL = a0.b0
|
|
# PM = (a1-a0).(b0-b1)
|
|
# PH = a1.b1
|
|
# Then:
|
|
# (a1.B + a0) * (b1.B + b0) = PL + B.(PM+PL+PH) + B.B.PH
|
|
#
|
|
# Overall operation is BigNum mm * d0_d1.
|
|
# Each cycle computes m0_m1 * d0_d1
|
|
# to avoid underflow in (a1-a0) and (b0-b1) and the
|
|
# extra adds that it would entail, the main loop is
|
|
# broken into four variants:
|
|
# BM2DLLoop d0 >= d1, m0 <= m1
|
|
# BM2DNLLoop d0 >= d1, m0 > m1
|
|
# BM2DHLoop d0 < d1, m0 >= m1
|
|
# BM2DNHLoop d0 < d1, m0 < m1
|
|
# mm is assumed to be even length.
|
|
#
|
|
# The code within the loops is written on the assumption of an
|
|
# infinite supply of registers. Each name is used in a single
|
|
# assignment. Names are then assigned to the finite set of registers
|
|
# based on an analysis of lifetime of each name--this is the purpose
|
|
# of the "defines" at the start of the routine.
|
|
|
|
.align 2
.globl BnnMultiply2Digit
.globl BnnM2DFastLink
#
# BnnMultiply2Digit(ss, sl, mm, ml, d0, d1)
# Add mm * (d0 + d1.B) into the sl-word number at ss and return the carry
# out (0 or 1).  Arguments: ss, sl, mm, ml in $4 to $7; d0 and d1 are the
# fifth and sixth arguments at 16($sp) and 20($sp).
#
# Register allocation: every name below is assigned exactly once in the
# loop bodies; names whose lifetimes do not overlap share a register.
#
#define c0	$2	/* low carry */
#define tb1	$2
#define tc1	$2
#define tj1	$2
#define tn1	$2
#define tq1	$2
#define tz1	$2
#define tA2	$2
#define c1	$3	/* high carry */
#define th2	$3
#define ti2	$3
#define pH3	$3
#define tx3	$3
#define ty3	$3
#define ss	$4
#define sl	$5
#define mm	$6
#define ml	$7
#define mlim	$7
#define d0	$8
#define d1	$9
#define ds	$10	/* d0-d1 or d1-d0, whichever does not underflow */
#define t_z	$11
#define tC3	$11
#define s0	$11
#define ta0	$11
#define td0	$11
#define te1	$11
#define tf1	$11
#define s1	$11
#define to2	$11
#define tp2	$11
#define ts2	$11
#define pM1	$11
#define m0	$12
#define ms	$12	/* m1-m0 or m0-m1, whichever does not underflow */
#define tr2	$12
#define tu3	$12
#define tv3	$12
#define pL0	$13
#define tg1	$13
#define tk2	$13
#define tm2	$13
#define tt2	$13
#define tw2	$13
#define t_1	$14
#define pL1	$14
#define pH2	$14
#define pM2	$14
#define tB2	$14
#define m1	$15
#define borrow	$15
# Special "friends" entry point--allows fast non-standard procedure linkage.
# Permits passing d0:d1 in r8-r9 and a low-order 64-bit integer in r2-r3
# that is added to final result.
# Used from BnnMultiply and most highly optimized version of the PRL RSA
# implementation.
.ent BnnM2DFastLink
BnnM2DFastLink:
.frame $sp, 0, $31
        subu    sl,ml                   # sl -= ml;
        blez    ml,BM2D6                # if(ml <= 0) goto end_loop;
        lw      m0,0(mm)
        b       BnnM2DFLAux
.end BnnM2DFastLink # (ss sl mm ml d0, d1)

.ent BnnMultiply2Digit # (ss sl mm ml d0, d1)
BnnMultiply2Digit:
.frame $sp, 0, $31

.set noreorder
        lw      d0, 16($sp)             # d0;
        lw      d1, 20($sp)             # d1;
        li      c0,0
        li      c1,0
        blez    ml,BM2D6                # if(ml <= 0) goto end_loop;
        # BDSLOT
        subu    t_1,d0,1                # t_1 = d0-1
.set reorder
        or      t_z,d0,d1               # t_z = (d0 | d1)
        beq     t_z,0,BM2D7             # d0:d1 == 0: return(0);
        lw      m0,0(mm)
        or      t_1,d1                  # t_1 = (d0-1)|d1
        subu    sl,ml                   # sl -= ml;
        beq     t_1,0,BM2DADD0          # d0:d1 == 1: just add mm into ss
.set noreorder
BnnM2DFLAux:
        multu   d0,m0
#define EnableOddLength 1
#ifdef EnableOddLength
#define t_odd $15
#define t_a $15
#define t_b $14
#define t_c $15
#define t_d $15
#define t_e $14
#define t_f $13
#define t_g $15
# The code under EnableOddLength handles the case when the length of mm
# is odd: the first word of mm is multiplied by d0 and d1 on its own.
# NOTE(review): this path sets c0 and c1 without reading their incoming
# values, so a carry passed in through BnnM2DFastLink is not added when
# ml is odd -- confirm that FastLink callers never depend on it.
        and     t_odd,ml,1
        sll     mlim,ml,2               # ml *= 4;
        beq     t_odd,$0,BM2DmlEven
        # BDSLOT (executes on both paths)
        addu    mlim,mlim,mm            # mlim = mm+ml;
        lw      s0,0(ss)
        addu    mm,4
        addu    ss,4
        mflo    t_a                     # low word of d0*m0
        mfhi    t_b                     # high word of d0*m0
        addu    s0,t_a,s0
        sltu    t_c,s0,t_a
        multu   d1,m0
        lw      m0,0(mm)
        addu    t_d,t_c,t_b
        mflo    t_e                     # low word of d1*m0
        mfhi    t_f                     # high word of d1*m0
        addu    c0,t_e,t_d
        sltu    t_g,c0,t_e
        multu   d0,m0
        addu    c1,t_g,t_f
        beq     mm,mlim,BM2D6
        # BDSLOT
        sw      s0,-4(ss)

BM2DmlEven:
#else /* EnableOddLength */
        sll     mlim,ml,2               # ml *= 4;
        addu    mlim,mlim,mm            # mlim = mm+ml;
#endif /* EnableOddLength */
        lw      m1,4(mm)                # m1 = mm[1]
        bltu    d0,d1,BM2DHighBig       # expands to 2 instructions
        # BDSLOT
        nop
        bltu    m1,m0,BM2DLNeg          # expands to 2 instructions
        # BDSLOT
        subu    ds,d0,d1
        b       BM2DLPEntry
        # BDSLOT
        lw      s0,0(ss)

# Variant with d0 >= d1 and m0 <= m1: the middle product is added.
BM2DLLoop:
        lw      m0,0(mm)
        sw      tz1,4(ss)               # (pM1+pH2+s1+pL1+pL0+c1+(pL0+s0+c0)/B)%B -> ss[1] FIN
        multu   m0,d0
        addu    ss,8
        sltu    tA2,tz1,pM1
        addu    tB2,pM2,tA2             # tB2 = pM2 + (pM1+(pH2+s1+pL1+pL0+c1+(pL0+s0+c0)/B)%B)/B
        addu    c0,tB2,tw2              # c0 = pM2+pH3+(pH2+pL1+(pL1+(pL0+s0+c0)/B+pL0+c1)/B)%B + (pM1+(pH2+s1+pL1+pL0+c1+(pL0+s0+c0)/B)%B+pH2+s1+(pL1+pL0+c1+(pL0+s0+c0)/B)%B)/B
        lw      m1,4(mm)
        sltu    tC3,c0,tB2
        bltu    m1,m0,BM2DLNeg          # expands to 2 instructions
        # BDSLOT
        addu    c1,ty3,tC3

BM2DLPos:
        lw      s0,0(ss)
BM2DLPEntry:
        subu    ms,m1,m0
        addu    ta0,s0,c0               # ta0 = (s0+c0)%B
        mfhi    pL1
        mflo    pL0
        sltu    tb1,ta0,c0
        addu    tc1,pL1,tb1             # tc1 = pL1 + (s0+c0)/B
        multu   m1,d1
        addu    td0,pL0,ta0             # td0 = (pL0+s0+c0)%B
        sw      td0,0(ss)               # (pL0+s0+c0)%B -> ss[0] FIN
        sltu    te1,td0,pL0
        addu    tf1,tc1,te1             # tf1 = pL1 + (pL0+s0+c0)/B
        addu    tg1,pL0,c1              # tg1 = (pL0+c1)%B
        sltu    th2,tg1,c1
        addu    ti2,pL1,th2             # ti2 = pL1 + (pL0+c1)/B
        addu    tj1,tg1,tf1             # tj1 = (pL1+pL0+c1 + (pL0+s0+c0)/B)%B
        sltu    tk2,tj1,tg1
        lw      s1,4(ss)
        addu    tm2,ti2,tk2             # tm2 = pL1 + (pL1+(pL0+s0+c0)/B+pL0+c1)/B
        mfhi    pH3
        mflo    pH2
        addu    tn1,tj1,s1              # tn1 = (s1+pL1+pL0+c1 + (pL0+s0+c0)/B)%B
        sltu    to2,tn1,s1
        multu   ms,ds
        addu    tp2,pH3,to2             # tp2 = pH3 + (s1+(pL1+pL0+c1+(pL0+s0+c0)/B)%B)/B
        addu    tq1,pH2,tn1             # tq1 = (pH2+s1+pL1+pL0+c1 + (pL0+s0+c0)/B)%B
        sltu    tr2,tq1,pH2
        addu    ts2,tp2,tr2             # ts2 = pH3 + (pH2+s1+(pL1+pL0+c1+(pL0+s0+c0)/B)%B)/B
        addu    tt2,pH2,tm2             # tt2 = (pH2+pL1 + (pL1+(pL0+s0+c0)/B+pL0+c1)/B)%B
        sltu    tu3,tt2,pH2
        addu    tv3,pH3,tu3             # tv3 = pH3 + (pH2+pL1+(pL1+(pL0+s0+c0)/B+pL0+c1)/B)/B
        addu    tw2,ts2,tt2             # tw2 = pH3+(pH2+pL1+(pL1+(pL0+s0+c0)/B+pL0+c1)/B)%B + (pH2+s1+(pL1+pL0+c1+(pL0+s0+c0)/B)%B)/B
        sltu    tx3,tw2,ts2
        addu    ty3,tv3,tx3             # ty3 = pH3 + (pH3+(pH2+s1+(pL1+pL0+c1+(pL0+s0+c0)/B)%B)/B+(pH2+pL1+(pL1+(pL0+s0+c0)/B+pL0+c1)/B)%B)/B+(pH2+pL1+(pL1+(pL0+s0+c0)/B+pL0+c1)/B)/B
        addu    mm,8
        mflo    pM1
        mfhi    pM2
        bne     mlim,mm,BM2DLLoop       # if(mm!=mlim) goto BM2DLLoop;
        # BDSLOT
        addu    tz1,pM1,tq1             # tz1 = (pM1+pH2+s1+pL1+pL0+c1 + (pL0+s0+c0)/B)%B

.set reorder
        sw      tz1,4(ss)               # (pM1+pH2+s1+pL1+pL0+c1+(pL0+s0+c0)/B)%B -> ss[1] FIN
        addu    ss,8
        sltu    tA2,tz1,pM1
        addu    tB2,pM2,tA2             # tB2 = pM2 + carry out of ss[1]
        addu    c0,tB2,tw2              # c0 = tB2 + tw2 (low carry word)
        sltu    tC3,c0,tB2
        addu    c1,ty3,tC3              # c1 = ty3 + carry (high carry word)
        b       BM2D6
.set noreorder

# Variant with d0 >= d1 and m0 > m1: the middle product is subtracted.
# The carry chain from BM2DLNeg on is the same as from BM2DLPEntry on.
BM2DNLLoop:
        lw      m0,0(mm)
        subu    tz1,tq1,pM1             # tz1 = (-pM1+pH2+s1+pL1+pL0+c1 + (pL0+s0+c0)/B)%B
        multu   m0,d0
        sw      tz1,4(ss)               # tz1 -> ss[1] FIN
        addu    ss,8
        addu    tB2,pM2,borrow          # tB2 = pM2 + borrow out of ss[1]
        sltu    tC3,tw2,tB2
        lw      m1,4(mm)
        subu    c0,tw2,tB2              # c0 = tw2 - tB2 (low carry word)
        bgeu    m1,m0,BM2DLPos          # expands to 2 instructions
        # BDSLOT
        subu    c1,ty3,tC3              # c1 = ty3 - borrow (high carry word)

BM2DLNeg:
        lw      s0,0(ss)
        subu    ms,m0,m1
        addu    ta0,s0,c0               # ta0 = (s0+c0)%B
        mfhi    pL1
        mflo    pL0
        sltu    tb1,ta0,c0
        addu    tc1,pL1,tb1             # tc1 = pL1 + (s0+c0)/B
        multu   m1,d1
        addu    td0,pL0,ta0             # td0 = (pL0+s0+c0)%B
        sw      td0,0(ss)               # (pL0+s0+c0)%B -> ss[0] FIN
        sltu    te1,td0,pL0
        addu    tf1,tc1,te1             # tf1 = pL1 + (pL0+s0+c0)/B
        addu    tg1,pL0,c1              # tg1 = (pL0+c1)%B
        sltu    th2,tg1,c1
        addu    ti2,pL1,th2             # ti2 = pL1 + (pL0+c1)/B
        addu    tj1,tg1,tf1             # tj1 = (tg1+tf1)%B
        sltu    tk2,tj1,tg1
        lw      s1,4(ss)
        addu    tm2,ti2,tk2             # tm2 = ti2 + carry of tj1
        mfhi    pH3
        mflo    pH2
        addu    tn1,tj1,s1              # tn1 = (tj1+s1)%B
        sltu    to2,tn1,s1
        multu   ms,ds
        addu    tp2,pH3,to2             # tp2 = pH3 + carry of tn1
        addu    tq1,pH2,tn1             # tq1 = (pH2+tn1)%B
        sltu    tr2,tq1,pH2
        addu    ts2,tp2,tr2             # ts2 = tp2 + carry of tq1
        addu    tt2,pH2,tm2             # tt2 = (pH2+tm2)%B
        sltu    tu3,tt2,pH2
        addu    tv3,pH3,tu3             # tv3 = pH3 + carry of tt2
        addu    tw2,ts2,tt2             # tw2 = (ts2+tt2)%B
        sltu    tx3,tw2,ts2
        addu    ty3,tv3,tx3             # ty3 = tv3 + carry of tw2
# Disabled code kept from the original:
# Subtract ds
# sltu borrow,tw2,ds
# subu tw2,ds
# subu ty3,borrow
# End Subtract
        addu    mm,8
        mflo    pM1
        mfhi    pM2
        bne     mlim,mm,BM2DNLLoop      # if(mm!=mlim) goto BM2DNLLoop;
        # BDSLOT
        sltu    borrow,tq1,pM1          # borrow = (tq1 < pM1)

.set reorder
        subu    tz1,tq1,pM1             # tz1 = (-pM1+pH2+s1+pL1+pL0+c1 + (pL0+s0+c0)/B)%B
        sw      tz1,4(ss)               # tz1 -> ss[1] FIN
        addu    ss,8
        addu    tB2,pM2,borrow          # tB2 = pM2 + borrow out of ss[1]
        sltu    tC3,tw2,tB2
        subu    c0,tw2,tB2              # c0 = tw2 - tB2 (low carry word)
        subu    c1,ty3,tC3              # c1 = ty3 - borrow (high carry word)
        b       BM2D6
.set noreorder
BM2DHighBig:
        bltu    m0,m1,BM2DHNeg          # expands to 2 instructions
        # BDSLOT
        subu    ds,d1,d0
        b       BM2DHEntry
        # BDSLOT
        lw      s0,0(ss)

# Variant with d0 < d1 and m0 >= m1: the middle product is added.
# The carry chain from BM2DHEntry on is the same as from BM2DLPEntry on.
BM2DHLoop:
        lw      m0,0(mm)
        sw      tz1,4(ss)               # tz1 -> ss[1] FIN
        multu   m0,d0
        addu    ss,8
        sltu    tA2,tz1,pM1
        addu    tB2,pM2,tA2             # tB2 = pM2 + carry out of ss[1]
        addu    c0,tB2,tw2              # c0 = tB2 + tw2 (low carry word)
        lw      m1,4(mm)
        sltu    tC3,c0,tB2
        bltu    m0,m1,BM2DHNeg          # expands to 2 instructions
        # BDSLOT
        addu    c1,ty3,tC3              # c1 = ty3 + carry (high carry word)

BM2DHPos:
        lw      s0,0(ss)
BM2DHEntry:
        subu    ms,m0,m1
        addu    ta0,s0,c0               # ta0 = (s0+c0)%B
        mfhi    pL1
        mflo    pL0
        sltu    tb1,ta0,c0
        addu    tc1,pL1,tb1             # tc1 = pL1 + (s0+c0)/B
        multu   m1,d1
        addu    td0,pL0,ta0             # td0 = (pL0+s0+c0)%B
        sw      td0,0(ss)               # (pL0+s0+c0)%B -> ss[0] FIN
        sltu    te1,td0,pL0
        addu    tf1,tc1,te1             # tf1 = pL1 + (pL0+s0+c0)/B
        addu    tg1,pL0,c1              # tg1 = (pL0+c1)%B
        sltu    th2,tg1,c1
        addu    ti2,pL1,th2             # ti2 = pL1 + (pL0+c1)/B
        addu    tj1,tg1,tf1             # tj1 = (tg1+tf1)%B
        sltu    tk2,tj1,tg1
        lw      s1,4(ss)
        addu    tm2,ti2,tk2             # tm2 = ti2 + carry of tj1
        mfhi    pH3
        mflo    pH2
        addu    tn1,tj1,s1              # tn1 = (tj1+s1)%B
        sltu    to2,tn1,s1
        multu   ms,ds
        addu    tp2,pH3,to2             # tp2 = pH3 + carry of tn1
        addu    tq1,pH2,tn1             # tq1 = (pH2+tn1)%B
        sltu    tr2,tq1,pH2
        addu    ts2,tp2,tr2             # ts2 = tp2 + carry of tq1
        addu    tt2,pH2,tm2             # tt2 = (pH2+tm2)%B
        sltu    tu3,tt2,pH2
        addu    tv3,pH3,tu3             # tv3 = pH3 + carry of tt2
        addu    tw2,ts2,tt2             # tw2 = (ts2+tt2)%B
        sltu    tx3,tw2,ts2
        addu    ty3,tv3,tx3             # ty3 = tv3 + carry of tw2
        addu    mm,8
        mflo    pM1
        mfhi    pM2
        bne     mlim,mm,BM2DHLoop       # if(mm!=mlim) goto BM2DHLoop;
        # BDSLOT
        addu    tz1,pM1,tq1             # tz1 = (pM1+pH2+s1+pL1+pL0+c1 + (pL0+s0+c0)/B)%B

.set reorder
        sw      tz1,4(ss)               # tz1 -> ss[1] FIN
        addu    ss,8
        sltu    tA2,tz1,pM1
        addu    tB2,pM2,tA2             # tB2 = pM2 + carry out of ss[1]
        addu    c0,tB2,tw2              # c0 = tB2 + tw2 (low carry word)
        sltu    tC3,c0,tB2
        addu    c1,ty3,tC3              # c1 = ty3 + carry (high carry word)
        b       BM2D6
.set noreorder

# Variant with d0 < d1 and m0 < m1: the middle product is subtracted.
# The carry chain from BM2DHNeg on is the same as from BM2DLPEntry on.
BM2DNHLoop:
        lw      m0,0(mm)
        subu    tz1,tq1,pM1             # tz1 = (-pM1+pH2+s1+pL1+pL0+c1 + (pL0+s0+c0)/B)%B
        multu   m0,d0
        sw      tz1,4(ss)               # tz1 -> ss[1] FIN
        addu    ss,8
        addu    tB2,pM2,borrow          # tB2 = pM2 + borrow out of ss[1]
        sltu    tC3,tw2,tB2
        lw      m1,4(mm)
        subu    c0,tw2,tB2              # c0 = tw2 - tB2 (low carry word)
        bgeu    m0,m1,BM2DHPos          # expands to 2 instructions
        # BDSLOT
        subu    c1,ty3,tC3              # c1 = ty3 - borrow (high carry word)

BM2DHNeg:
        lw      s0,0(ss)
        subu    ms,m1,m0
        addu    ta0,s0,c0               # ta0 = (s0+c0)%B
        mfhi    pL1
        mflo    pL0
        sltu    tb1,ta0,c0
        addu    tc1,pL1,tb1             # tc1 = pL1 + (s0+c0)/B
        multu   m1,d1
        addu    td0,pL0,ta0             # td0 = (pL0+s0+c0)%B
        sw      td0,0(ss)               # (pL0+s0+c0)%B -> ss[0] FIN
        sltu    te1,td0,pL0
        addu    tf1,tc1,te1             # tf1 = pL1 + (pL0+s0+c0)/B
        addu    tg1,pL0,c1              # tg1 = (pL0+c1)%B
        sltu    th2,tg1,c1
        addu    ti2,pL1,th2             # ti2 = pL1 + (pL0+c1)/B
        addu    tj1,tg1,tf1             # tj1 = (tg1+tf1)%B
        sltu    tk2,tj1,tg1
        lw      s1,4(ss)
        addu    tm2,ti2,tk2             # tm2 = ti2 + carry of tj1
        mfhi    pH3
        mflo    pH2
        addu    tn1,tj1,s1              # tn1 = (tj1+s1)%B
        sltu    to2,tn1,s1
        multu   ms,ds
        addu    tp2,pH3,to2             # tp2 = pH3 + carry of tn1
        addu    tq1,pH2,tn1             # tq1 = (pH2+tn1)%B
        sltu    tr2,tq1,pH2
        addu    ts2,tp2,tr2             # ts2 = tp2 + carry of tq1
        addu    tt2,pH2,tm2             # tt2 = (pH2+tm2)%B
        sltu    tu3,tt2,pH2
        addu    tv3,pH3,tu3             # tv3 = pH3 + carry of tt2
        addu    tw2,ts2,tt2             # tw2 = (ts2+tt2)%B
        sltu    tx3,tw2,ts2
        addu    ty3,tv3,tx3             # ty3 = tv3 + carry of tw2
# Disabled code kept from the original:
# Subtract ds
# sltu borrow,tw2,ds
# subu tw2,ds
# subu ty3,borrow
# End Subtract
        addu    mm,8
        mflo    pM1
        mfhi    pM2
        bne     mlim,mm,BM2DNHLoop      # if(mm!=mlim) goto BM2DNHLoop;
        # BDSLOT
        sltu    borrow,tq1,pM1          # borrow = (tq1 < pM1)

.set reorder
        subu    tz1,tq1,pM1             # tz1 = (-pM1+pH2+s1+pL1+pL0+c1 + (pL0+s0+c0)/B)%B
        sw      tz1,4(ss)               # tz1 -> ss[1] FIN
        addu    ss,8
        addu    tB2,pM2,borrow          # tB2 = pM2 + borrow out of ss[1]
        sltu    tC3,tw2,tB2
        subu    c0,tw2,tB2              # c0 = tw2 - tB2 (low carry word)
        subu    c1,ty3,tC3              # c1 = ty3 - borrow (high carry word)
# Falls through into BM2D6.

# Add the two carry words c0, c1 into ss[0], ss[1], then propagate.
BM2D6:
        lw      s0,0(ss)                # s0 = *ss;
        addu    c0,s0                   # c0 = (c0+s0)%B
        sltu    $12,c0,s0               # r = (c0+s0)/B
        lw      s1,4(ss)                # s1 = ss[1]; reuses the register of s0
        sw      c0,0(ss)                # *ss = c0;
        addu    c1,s1                   # c1 = (c1+s1)%B
        sltu    c0,c1,s1                # c0 = (c1+s1)/B
        addu    c1,$12                  # c1 = (c1+s1+(c0+s0)/B)%B
        sltu    $12,c1,$12              # r = ((c1+s1)%B+(c0+s0)/B)/B
        sw      c1,4(ss)                # ss[1] = c1;
        addu    c0,$12                  # c0 = (c1+s1+(c0+s0)/B)/B
        addu    ss,8                    # ss+=2;
        bne     c0,0,BM2D8              # if(c0) goto BM2D8;
BM2D7:  li      $2,0                    # return(0);
        j       $31
BM2D8:  subu    $5,2                    # sl-=2;
        blez    $5,BM2D10               # if(sl <= 0) return(1);
BM2D9:  subu    $5,1                    # sl--;
        lw      $9,0($4)                # X = *ss;
        addu    $9,1                    # X++;
        sw      $9,0($4)                # *ss = X;
        bne     $9,$0,BM2D7             # if(X) return(0);
        addu    $4,4                    # ss++;
        bgtz    $5,BM2D9                # if(sl > 0) goto BM2D9;
BM2D10: li      $2,1                    # return(1);
        j       $31

#==============================================================================
# Case d0:d1 == 1: add mm into ss word by word, then propagate the carry
# through the remaining sl - ml words of ss.  Raw register numbers are
# used here: $4 = ss, $5 = sl - ml, $6 = mm, $7 = ml, $2 = c.

BM2DADD0: li    c0,0                    # c = 0
BM2DADD1: subu  $7,1                    # ml--;
        lw      $15,0($4)               # save = *ss;
        addu    $4,4                    # ss++;
        addu    $15,$2                  # save += c;
        sltu    $14,$15,$2              # cc = (save < c);
        lw      $10,0($6)               # X = *mm;
        addu    $6,4                    # mm++;
        addu    $10,$15                 # X += save;
        sw      $10,-4($4)              # ss[-1] = X
        sltu    $15,$10,$15             # save = (X < save);
        addu    $2,$15,$14              # c = cc + save;
        bne     $7,$0,BM2DADD1          # if(ml) goto BM2DADD1;

        beq     $5,0,BM2DADD4           # no words left in ss: return(c), not a fixed 1
        beq     $2,0,BM2DADD3           # if(c == 0) return(0);
BM2DADD2: subu  $5,1                    # sl--;
        lw      $9,0($4)                # X = *ss;
        addu    $9,1                    # X++;
        sw      $9,0($4)                # *ss = X;
        addu    $4,4                    # ss++;
        bne     $9,$0,BM2DADD3          # if(X) return(0);
        bne     $5,$0,BM2DADD2          # if(sl) goto BM2DADD2;
BM2DADD4:
        j       $31                     # return(c); c is 1 when reached by fall-through
BM2DADD3: move  $2,$0                   # return(0)
        j       $31
#undef c0
#undef tb1
#undef tc1
#undef tj1
#undef tn1
#undef tq1
#undef tz1
#undef tA2
#undef c1
#undef th2
#undef ti2
#undef pH3
#undef tx3
#undef ty3
#undef ss
#undef sl
#undef mm
#undef ml
#undef mlim
#undef d0
#undef d1
#undef ds
#undef t_z
#undef tC3
#undef s0
#undef ta0
#undef td0
#undef te1
#undef tf1
#undef s1
#undef to2
#undef tp2
#undef ts2
#undef pM1
#undef m0
#undef ms
#undef tr2
#undef tu3
#undef tv3
#undef pL0
#undef tg1
#undef tk2
#undef tm2
#undef tt2
#undef tw2
#undef t_1
#undef pL1
#undef pH2
#undef pM2
#undef tB2
#undef m1
#undef borrow
#ifdef EnableOddLength
#undef t_odd
#undef t_a
#undef t_b
#undef t_c
#undef t_d
#undef t_e
#undef t_f
#undef t_g
#endif /* EnableOddLength */
.end BnnMultiply2Digit
|
|
|
|
.align 2
|
|
# BnnMultiply (pp, pl, mm, ml, nn, nl)
#
# Register and stack usage visible in this routine:
#   $4 = pp, $5 = pl, $6 = mm, $7 = ml on entry; nn and nl are loaded
#   from 72($sp) and 76($sp) once the 56-byte frame has been allocated.
#   $17 = pp, $18 = pl, $21 = mm, $22 = ml, $16 = nn, $19 = nl, $20 = c.
#   $2 returns c, the sum of the values returned by the helper calls.
#
# Two paths:
#   general ($37 .. $36): follows the C source quoted in the numbered
#     comments -- one BnnMultiplyDigit call when nl is odd, another when
#     ml is odd and nl != 0, then BnnM2DFastLink on two nn digits per pass.
#   squaring ($38 .. $42): taken when mm == nn and ml == nl; $21 and $22
#     are reused as scratch and each pass consumes two nn digits.
#
# BnnMultiplyDigit, BMDFastLinkage and BnnM2DFastLink are defined elsewhere
# in this file; their register conventions are assumed from the call sites
# below (TODO confirm against their definitions).
.globl BnnMultiply
|
|
#.loc 2 40
|
|
# 40 {
|
|
.ent BnnMultiply 2
|
|
BnnMultiply:
|
|
subu $sp, 56 # allocate the 56-byte frame
|
|
sw $31, 52($sp) # save the return address
|
|
sw $22, 48($sp) # save r22
|
|
sd $20, 40($sp) # save r20 (the .mask below also lists r21)
|
|
sd $18, 32($sp) # save r18 (the .mask below also lists r19)
|
|
sd $16, 24($sp) # save r16 (the .mask below also lists r17)
|
|
.mask 0x807F0000, -4 # saved set: bit 31 and bits 16..22
|
|
.frame $sp, 56, $31
|
|
move $17, $4 # r17 = pp
|
|
move $18, $5 # r18 = pl
|
|
move $21, $6 # r21 = mm
|
|
move $22, $7 # r22 = ml
|
|
lw $16, 72($sp) # r16 = nn (offset 72 = frame size 56 + 16)
|
|
lw $19, 76($sp) # r19 = nl
|
|
#.loc 2 74
|
|
# 74 if (nl & 1)
|
|
and $14, $19, 1 # r14 = nl & 1, tested on both paths
|
|
bne $6, $16, $37 # mm != nn: general multiply
|
|
move $20, $0 # c = 0
|
|
beq $7, $19, $38 # mm == nn and ml == nl: squaring path
|
|
#.loc 2 73
|
|
# 73 c = 0;
|
|
$37:
|
|
move $20, $0 # c = 0
|
|
bne $14, $0, $32 # nl odd: handle one nn digit first
|
|
b $33
|
|
$32:
|
|
#.loc 2 76
|
|
# 75 {
|
|
# 76 c += BnnMultiplyDigit (pp, pl, mm, ml, *nn);
|
|
move $4, $17 # arg1 = pp
|
|
move $5, $18 # arg2 = pl
|
|
move $6, $21 # arg3 = mm
|
|
move $7, $22 # arg4 = ml
|
|
lw $15, 0($16) # r15 = *nn
|
|
sw $15, 16($sp) # arg5 = *nn, passed at 16($sp)
|
|
jal BnnMultiplyDigit
|
|
move $20, $2 # c = result (c was 0 before this call)
|
|
#.loc 2 77
|
|
# 77 pp++, nn++, nl--, pl--;
|
|
addu $17, $17, 4
|
|
addu $16, $16, 4
|
|
addu $19, $19, -1
|
|
addu $18, $18, -1
|
|
#.loc 2 78
|
|
# 78 }
|
|
$33:
|
|
#.loc 2 79
|
|
# 79 if ((ml & 1) && nl)
|
|
and $24, $22, 1 # r24 = ml & 1
|
|
beq $24, $0, $34 # ml even: skip
|
|
beq $19, 0, $34 # nl == 0: skip
|
|
#.loc 2 81
|
|
# 80 {
|
|
# 81 c += BnnMultiplyDigit (pp, pl, nn, nl, *mm);
|
|
move $4, $17 # arg1 = pp
|
|
move $5, $18 # arg2 = pl
|
|
move $6, $16 # arg3 = nn
|
|
move $7, $19 # arg4 = nl
|
|
lw $25, 0($21) # r25 = *mm
|
|
sw $25, 16($sp) # arg5 = *mm, passed at 16($sp)
|
|
jal BnnMultiplyDigit
|
|
addu $20, $20, $2 # c += result
|
|
#.loc 2 82
|
|
# 82 pp++, mm++, ml--, pl--;
|
|
addu $17, $17, 4
|
|
addu $21, $21, 4
|
|
addu $22, $22, -1
|
|
addu $18, $18, -1
|
|
#.loc 2 83
|
|
# 83 }
|
|
$34:
|
|
#.loc 2 84
|
|
# 84 while (nl > 0)
|
|
bleu $19, 0, $36 # unsigned nl <= 0, i.e. nl == 0: skip the loop
|
|
$35:
|
|
#.loc 2 86
|
|
# 85 {
|
|
# 86 c += BnnMultiply2Digit (pp, pl, mm, ml, nn[0], nn[1]);
|
|
move $4, $17 # arg1 = pp
|
|
move $5, $18 # arg2 = pl
|
|
move $6, $21 # arg3 = mm
|
|
move $7, $22 # arg4 = ml
|
|
lw $8, 0($16) # r8 = nn[0]
|
|
lw $9, 4($16) # r9 = nn[1]
|
|
li $2, 0 # r2 = 0 (the squaring loop passes c0 here)
|
|
li $3, 0 # r3 = 0 (the squaring loop passes c1 here)
|
|
jal BnnM2DFastLink # digits passed in r8/r9 rather than on the stack
|
|
addu $20, $20, $2 # c += result
|
|
#.loc 2 87
|
|
# 87 pp += 2, nn += 2, nl -= 2, pl -= 2;
|
|
addu $17, $17, 8
|
|
addu $16, $16, 8
|
|
addu $19, $19, -2
|
|
addu $18, $18, -2
|
|
#.loc 2 88
|
|
# 88 }
|
|
#.loc 2 88
|
|
bgtu $19, 0, $35 # unsigned nl > 0, i.e. nl != 0: next pair
|
|
$36:
|
|
#.loc 2 89
|
|
# 89 return c;
|
|
move $2, $20 # return value = c
|
|
ld $16, 24($sp) # reload from the slot written by sd $16 above
|
|
ld $18, 32($sp) # reload from the slot written by sd $18 above
|
|
ld $20, 40($sp) # reload from the slot written by sd $20 above
|
|
lw $22, 48($sp) # restore r22
|
|
lw $31, 52($sp) # restore the return address
|
|
addu $sp, 56 # release the frame
|
|
j $31
|
|
$38:
|
|
# We no longer need r21, r22 since nn == mm && nl == ml
|
|
li $21, 0 # r21 = 0: no previous digit yet (r21 is the sgn(n[-1]) mask in the loop)
|
|
beq $14, $0, $40 # if ((nl&1) == 0) goto $40
|
|
lw $21, 0($16) # r21 = d = *nn
|
|
multu $21, $21 # d*d
|
|
lw $12, 0($17) # r12 = *pp
|
|
addu $16, 4 # nn++
|
|
addu $8, $21, $21 # d2 = 2*d
|
|
addu $17, 8 # pp += 2
|
|
mflo $13 # d*d % 2^32
|
|
addu $13, $12 # r13 = new pp[0] = (*pp + d*d) % 2^32
|
|
sltu $10, $13, $12 # r10 = carry = (*pp + d*d) / 2^32
|
|
mfhi $9 # r9 = save = d*d / 2^32
|
|
subu $4, $17, 4 # arg1 = pp-1
|
|
subu $5, $18, 1 # arg2 = pl-1
|
|
subu $18, 2 # pl -= 2
|
|
subu $19, 1 # nl--
|
|
move $6, $16 # arg3 = nn
|
|
move $7, $19 # arg4 = nl
|
|
sw $13, -8($17) # pp[-2] = r13 (the updated low digit)
|
|
jal BMDFastLinkage # BnnMultiplyDigit(r4,r5,r6,r7,r8)+(r9+r10)%2^32
|
|
addu $20, $2 # c += result
|
|
sra $21,31 # r21 = d >> 31 (arithmetic): all ones when the top bit of d is set, else 0
|
|
$40:
|
|
# 84 while (nl > 0)
|
|
bleu $19, 0, $42 # unsigned nl <= 0, i.e. nl == 0: skip the loop
|
|
$41:
|
|
# 85 {
|
|
# compute d0:d1*d0:d1+p0:p1+c0:c1 -> p0:p1:c0:c1 with maximal overlap of
|
|
# single cycle instruction with multiplier operation.
|
|
#
|
|
# observe a*b+c+d <= 2^64-1 for all a,b,c,d < 2^32
|
|
# we can exploit this property to minimize carry tests
|
|
# Accordingly, computation can be organized as follows:
|
|
# d0*d0 -> l0:l1 d0*d1 -> m0:m1 d1*d1 -> h0:h1
|
|
#
|
|
# c0 c1 L1 M1
|
|
# p0 p1 M0 N1
|
|
# l0:l1 m0:m1 m0:m1 h0:h1
|
|
# ===== ===== ===== =====
|
|
# L0:L1 M0:M1 N0:N1 H0:H1
|
|
# -> P0 P1 C0:C1
|
|
#
|
|
lw $8, 0($16) # r8 = d0 = nn[0]
|
|
lw $15, 4($16) # r15 = d1 = nn[1]
|
|
multu $8, $8 # d0*d0
|
|
and $2, $8, $21 # c0 = d0*sgn(n[-1])
|
|
and $3, $15, $21 # c1 = d1*sgn(n[-1])
|
|
slt $22, $21, $0 # r22 = n[-1] < 0
|
|
sra $21, $15, 31 # r21 = d1 >> 31 (arithmetic), read by the next pass
|
|
lw $10, 0($17) # r10 = p0
|
|
lw $11, 4($17) # r11 = p1
|
|
addu $17, 16 # pp += 4
|
|
addu $10, $2 # r10 = L(p0+c0)
|
|
sltu $2, $10, $2 # r2 = H(p0+c0)
|
|
addu $11, $3 # r11 = L(p1+c1)
|
|
sltu $3, $11, $3 # r3 = H(p1+c1)
|
|
# enough computation to prevent a stall
|
|
mflo $12 # l0
|
|
mfhi $13 # l1
|
|
addu $10, $12 # r10 = L0 = L(p0+c0+l0)
|
|
sw $10,-16($17) # pp[-4] = L0
|
|
multu $8, $15 # d0*d1
|
|
addu $16, 8 # nn += 2
|
|
sltu $12, $10, $12 # r12 = H(L(p0+c0)+l0)
|
|
# r12+r2 = H(p0+c0+l0)
|
|
addu $12, $13 # assert r12 == 0 || r2 == 0
|
|
addu $12, $2 # r12 = L1 = l1+H(p0+c0+l0)
|
|
# Free: 2,9,10,13,14; Used: r11:r3 = p1+c1 r8=d0 r15=d1 r12=L1
|
|
slt $14, $8, $0 # r14 = n[0] < 0
|
|
addu $8, $8 # r8 = L(2*d0)
|
|
addu $8, $22 # r8 = L(2*d0+(n[-1] < 0))
|
|
addu $9, $15, $15 # r9 = L(2*d1)
|
|
addu $9, $14 # r9 = L(2*d1+(d0 < 0))
|
|
subu $18, 4 # pl -= 4
|
|
subu $19, 2 # nl -= 2
|
|
# enough computation to prevent a stall
|
|
mflo $10 # m0
|
|
mfhi $14 # m1
|
|
addu $11, $10 # r11 = M0 = L(p1+c1+m0)
|
|
sltu $13, $11, $10 # r13 = H(L(p1+c1)+m0)
|
|
# r13+r3 = H(p1+c1+m0)
|
|
multu $15, $15 # d1*d1
|
|
addu $13, $14 # assert before r11 == 0 || r3 == 0
|
|
addu $13, $3 # r13 = M1 = m1+H(p1+c1+m0)
|
|
# Free: 2,3,15; Used: r8:r9 = 2*d0:d1 r10=m0 r11=M0 r12=L1 r13=M1 r14=m1
|
|
addu $10, $11 # r10 = L(m0+M0)
|
|
sltu $11, $10, $11 # r11 = H(m0+M0)
|
|
addu $10, $12 # r10 = N0 = L(M0+m0+L1)
|
|
sw $10, -12($17) # pp[-3] = N0
|
|
sltu $12, $10, $12 # r12 = H(L(m0+M0)+L1)
|
|
# r12+r11 = H(M0+m0+L1)
|
|
addu $14, $11 # assert r11 == 0 || r12 == 0
|
|
addu $14, $12 # r14 = N1 = m1+H(M0+m0+L1)
|
|
addu $14, $13 # r14 = L(M1+N1)
|
|
sltu $13, $14, $13 # r13 = H(M1+N1)
|
|
# enough computation to prevent a stall
|
|
mflo $10 # h0
|
|
mfhi $11 # h1
|
|
addu $2, $10, $14 # c0 = L(M1+N1+h0)
|
|
sltu $14, $2, $14 # r14 = H(L(M1+N1)+h0)
|
|
# r14+r13 = H(M1+N1+h0)
|
|
addu $3, $11, $14 # assert r14 == 0 || r13 == 0
|
|
addu $3, $13 # c1 = h1+H(M1+N1+h0)
|
|
addu $4, $17, -8 # arg1 = pp-2
|
|
addu $5, $18, 2 # arg2 = pl+2
|
|
move $6, $16 # arg3 = nn
|
|
move $7, $19 # arg4 = nl
|
|
jal BnnM2DFastLink # r8:r9 = doubled d0:d1, r2:r3 = c0:c1 from above
|
|
addu $20, $20, $2 # c += result
|
|
# 88 }
|
|
bgtu $19, 0, $41 # unsigned nl > 0, i.e. nl != 0: next pair
|
|
$42:
|
|
# 89 return c;
|
|
move $2, $20 # return value = c
|
|
ld $16, 24($sp) # reload from the slot written by sd $16 in the prologue
|
|
ld $18, 32($sp) # reload from the slot written by sd $18 in the prologue
|
|
ld $20, 40($sp) # reload from the slot written by sd $20 in the prologue
|
|
lw $22, 48($sp) # restore r22
|
|
lw $31, 52($sp) # restore the return address
|
|
addu $sp, 56 # release the frame
|
|
j $31
|
|
.end BnnMultiply
|