Index: openssl/crypto/bn/asm/mips.pl |
diff --git a/openssl/crypto/bn/asm/mips.pl b/openssl/crypto/bn/asm/mips.pl |
deleted file mode 100644 |
index 38b51645f067eb0d4a83d5d1277631eb575330ea..0000000000000000000000000000000000000000 |
--- a/openssl/crypto/bn/asm/mips.pl |
+++ /dev/null |
@@ -1,2585 +0,0 @@ |
-#!/usr/bin/env perl |
-# |
-# ==================================================================== |
-# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL |
-# project. |
-# |
-# Rights for redistribution and usage in source and binary forms are |
-# granted according to the OpenSSL license. Warranty of any kind is |
-# disclaimed. |
-# ==================================================================== |
- |
- |
-# July 1999 |
-# |
-# This is drop-in MIPS III/IV ISA replacement for crypto/bn/bn_asm.c. |
-# |
-# The module is designed to work with either of the "new" MIPS ABI(5), |
-# namely N32 or N64, offered by IRIX 6.x. It's not ment to work under |
-# IRIX 5.x not only because it doesn't support new ABIs but also |
-# because 5.x kernels put R4x00 CPU into 32-bit mode and all those |
-# 64-bit instructions (daddu, dmultu, etc.) found below gonna only |
-# cause illegal instruction exception:-( |
-# |
-# In addition the code depends on preprocessor flags set up by MIPSpro |
-# compiler driver (either as or cc) and therefore (probably?) can't be |
-# compiled by the GNU assembler. GNU C driver manages fine though... |
-# I mean as long as -mmips-as is specified or is the default option, |
-# because then it simply invokes /usr/bin/as which in turn takes |
-# perfect care of the preprocessor definitions. Another neat feature |
-# offered by the MIPSpro assembler is an optimization pass. This gave |
-# me the opportunity to have the code looking more regular as all those |
-# architecture dependent instruction rescheduling details were left to |
-# the assembler. Cool, huh? |
-# |
-# Performance improvement is astonishing! 'apps/openssl speed rsa dsa' |
-# goes way over 3 times faster! |
-# |
-# <appro@fy.chalmers.se> |
- |
-# October 2010 |
-# |
-# Adapt the module even for 32-bit ABIs and other OSes. The former was |
-# achieved by mechanical replacement of 64-bit arithmetic instructions |
-# such as dmultu, daddu, etc. with their 32-bit counterparts and |
-# adjusting offsets denoting multiples of BN_ULONG. Above mentioned |
-# >3x performance improvement naturally does not apply to 32-bit code |
-# [because there is no instruction 32-bit compiler can't use], one |
-# has to content with 40-85% improvement depending on benchmark and |
-# key length, more for longer keys. |
- |
-$flavour = shift; |
-while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} |
-open STDOUT,">$output"; |
- |
-if ($flavour =~ /64|n32/i) { |
- $LD="ld"; |
- $ST="sd"; |
- $MULTU="dmultu"; |
- $DIVU="ddivu"; |
- $ADDU="daddu"; |
- $SUBU="dsubu"; |
- $SRL="dsrl"; |
- $SLL="dsll"; |
- $BNSZ=8; |
- $PTR_ADD="daddu"; |
- $PTR_SUB="dsubu"; |
- $SZREG=8; |
- $REG_S="sd"; |
- $REG_L="ld"; |
-} else { |
- $LD="lw"; |
- $ST="sw"; |
- $MULTU="multu"; |
- $DIVU="divu"; |
- $ADDU="addu"; |
- $SUBU="subu"; |
- $SRL="srl"; |
- $SLL="sll"; |
- $BNSZ=4; |
- $PTR_ADD="addu"; |
- $PTR_SUB="subu"; |
- $SZREG=4; |
- $REG_S="sw"; |
- $REG_L="lw"; |
- $code=".set mips2\n"; |
-} |
- |
-# Below is N32/64 register layout used in the original module. |
-# |
-($zero,$at,$v0,$v1)=map("\$$_",(0..3)); |
-($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11)); |
-($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25)); |
-($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23)); |
-($gp,$sp,$fp,$ra)=map("\$$_",(28..31)); |
-($ta0,$ta1,$ta2,$ta3)=($a4,$a5,$a6,$a7); |
-# |
-# No special adaptation is required for O32. NUBI on the other hand |
-# is treated by saving/restoring ($v1,$t0..$t3). |
- |
-$gp=$v1 if ($flavour =~ /nubi/i); |
- |
-$minus4=$v1; |
- |
-$code.=<<___; |
-.rdata |
-.asciiz "mips3.s, Version 1.2" |
-.asciiz "MIPS II/III/IV ISA artwork by Andy Polyakov <appro\@fy.chalmers.se>" |
- |
-.text |
-.set noat |
- |
-.align 5 |
-.globl bn_mul_add_words |
-.ent bn_mul_add_words |
-bn_mul_add_words: |
- .set noreorder |
- bgtz $a2,bn_mul_add_words_internal |
- move $v0,$zero |
- jr $ra |
- move $a0,$v0 |
-.end bn_mul_add_words |
- |
-.align 5 |
-.ent bn_mul_add_words_internal |
-bn_mul_add_words_internal: |
-___ |
-$code.=<<___ if ($flavour =~ /nubi/i); |
- .frame $sp,6*$SZREG,$ra |
- .mask 0x8000f008,-$SZREG |
- .set noreorder |
- $PTR_SUB $sp,6*$SZREG |
- $REG_S $ra,5*$SZREG($sp) |
- $REG_S $t3,4*$SZREG($sp) |
- $REG_S $t2,3*$SZREG($sp) |
- $REG_S $t1,2*$SZREG($sp) |
- $REG_S $t0,1*$SZREG($sp) |
- $REG_S $gp,0*$SZREG($sp) |
-___ |
-$code.=<<___; |
- .set reorder |
- li $minus4,-4 |
- and $ta0,$a2,$minus4 |
- $LD $t0,0($a1) |
- beqz $ta0,.L_bn_mul_add_words_tail |
- |
-.L_bn_mul_add_words_loop: |
- $MULTU $t0,$a3 |
- $LD $t1,0($a0) |
- $LD $t2,$BNSZ($a1) |
- $LD $t3,$BNSZ($a0) |
- $LD $ta0,2*$BNSZ($a1) |
- $LD $ta1,2*$BNSZ($a0) |
- $ADDU $t1,$v0 |
- sltu $v0,$t1,$v0 # All manuals say it "compares 32-bit |
- # values", but it seems to work fine |
- # even on 64-bit registers. |
- mflo $at |
- mfhi $t0 |
- $ADDU $t1,$at |
- $ADDU $v0,$t0 |
- $MULTU $t2,$a3 |
- sltu $at,$t1,$at |
- $ST $t1,0($a0) |
- $ADDU $v0,$at |
- |
- $LD $ta2,3*$BNSZ($a1) |
- $LD $ta3,3*$BNSZ($a0) |
- $ADDU $t3,$v0 |
- sltu $v0,$t3,$v0 |
- mflo $at |
- mfhi $t2 |
- $ADDU $t3,$at |
- $ADDU $v0,$t2 |
- $MULTU $ta0,$a3 |
- sltu $at,$t3,$at |
- $ST $t3,$BNSZ($a0) |
- $ADDU $v0,$at |
- |
- subu $a2,4 |
- $PTR_ADD $a0,4*$BNSZ |
- $PTR_ADD $a1,4*$BNSZ |
- $ADDU $ta1,$v0 |
- sltu $v0,$ta1,$v0 |
- mflo $at |
- mfhi $ta0 |
- $ADDU $ta1,$at |
- $ADDU $v0,$ta0 |
- $MULTU $ta2,$a3 |
- sltu $at,$ta1,$at |
- $ST $ta1,-2*$BNSZ($a0) |
- $ADDU $v0,$at |
- |
- |
- and $ta0,$a2,$minus4 |
- $ADDU $ta3,$v0 |
- sltu $v0,$ta3,$v0 |
- mflo $at |
- mfhi $ta2 |
- $ADDU $ta3,$at |
- $ADDU $v0,$ta2 |
- sltu $at,$ta3,$at |
- $ST $ta3,-$BNSZ($a0) |
- $ADDU $v0,$at |
- .set noreorder |
- bgtzl $ta0,.L_bn_mul_add_words_loop |
- $LD $t0,0($a1) |
- |
- beqz $a2,.L_bn_mul_add_words_return |
- nop |
- |
-.L_bn_mul_add_words_tail: |
- .set reorder |
- $LD $t0,0($a1) |
- $MULTU $t0,$a3 |
- $LD $t1,0($a0) |
- subu $a2,1 |
- $ADDU $t1,$v0 |
- sltu $v0,$t1,$v0 |
- mflo $at |
- mfhi $t0 |
- $ADDU $t1,$at |
- $ADDU $v0,$t0 |
- sltu $at,$t1,$at |
- $ST $t1,0($a0) |
- $ADDU $v0,$at |
- beqz $a2,.L_bn_mul_add_words_return |
- |
- $LD $t0,$BNSZ($a1) |
- $MULTU $t0,$a3 |
- $LD $t1,$BNSZ($a0) |
- subu $a2,1 |
- $ADDU $t1,$v0 |
- sltu $v0,$t1,$v0 |
- mflo $at |
- mfhi $t0 |
- $ADDU $t1,$at |
- $ADDU $v0,$t0 |
- sltu $at,$t1,$at |
- $ST $t1,$BNSZ($a0) |
- $ADDU $v0,$at |
- beqz $a2,.L_bn_mul_add_words_return |
- |
- $LD $t0,2*$BNSZ($a1) |
- $MULTU $t0,$a3 |
- $LD $t1,2*$BNSZ($a0) |
- $ADDU $t1,$v0 |
- sltu $v0,$t1,$v0 |
- mflo $at |
- mfhi $t0 |
- $ADDU $t1,$at |
- $ADDU $v0,$t0 |
- sltu $at,$t1,$at |
- $ST $t1,2*$BNSZ($a0) |
- $ADDU $v0,$at |
- |
-.L_bn_mul_add_words_return: |
- .set noreorder |
-___ |
-$code.=<<___ if ($flavour =~ /nubi/i); |
- $REG_L $t3,4*$SZREG($sp) |
- $REG_L $t2,3*$SZREG($sp) |
- $REG_L $t1,2*$SZREG($sp) |
- $REG_L $t0,1*$SZREG($sp) |
- $REG_L $gp,0*$SZREG($sp) |
- $PTR_ADD $sp,6*$SZREG |
-___ |
-$code.=<<___; |
- jr $ra |
- move $a0,$v0 |
-.end bn_mul_add_words_internal |
- |
-.align 5 |
-.globl bn_mul_words |
-.ent bn_mul_words |
-bn_mul_words: |
- .set noreorder |
- bgtz $a2,bn_mul_words_internal |
- move $v0,$zero |
- jr $ra |
- move $a0,$v0 |
-.end bn_mul_words |
- |
-.align 5 |
-.ent bn_mul_words_internal |
-bn_mul_words_internal: |
-___ |
-$code.=<<___ if ($flavour =~ /nubi/i); |
- .frame $sp,6*$SZREG,$ra |
- .mask 0x8000f008,-$SZREG |
- .set noreorder |
- $PTR_SUB $sp,6*$SZREG |
- $REG_S $ra,5*$SZREG($sp) |
- $REG_S $t3,4*$SZREG($sp) |
- $REG_S $t2,3*$SZREG($sp) |
- $REG_S $t1,2*$SZREG($sp) |
- $REG_S $t0,1*$SZREG($sp) |
- $REG_S $gp,0*$SZREG($sp) |
-___ |
-$code.=<<___; |
- .set reorder |
- li $minus4,-4 |
- and $ta0,$a2,$minus4 |
- $LD $t0,0($a1) |
- beqz $ta0,.L_bn_mul_words_tail |
- |
-.L_bn_mul_words_loop: |
- $MULTU $t0,$a3 |
- $LD $t2,$BNSZ($a1) |
- $LD $ta0,2*$BNSZ($a1) |
- $LD $ta2,3*$BNSZ($a1) |
- mflo $at |
- mfhi $t0 |
- $ADDU $v0,$at |
- sltu $t1,$v0,$at |
- $MULTU $t2,$a3 |
- $ST $v0,0($a0) |
- $ADDU $v0,$t1,$t0 |
- |
- subu $a2,4 |
- $PTR_ADD $a0,4*$BNSZ |
- $PTR_ADD $a1,4*$BNSZ |
- mflo $at |
- mfhi $t2 |
- $ADDU $v0,$at |
- sltu $t3,$v0,$at |
- $MULTU $ta0,$a3 |
- $ST $v0,-3*$BNSZ($a0) |
- $ADDU $v0,$t3,$t2 |
- |
- mflo $at |
- mfhi $ta0 |
- $ADDU $v0,$at |
- sltu $ta1,$v0,$at |
- $MULTU $ta2,$a3 |
- $ST $v0,-2*$BNSZ($a0) |
- $ADDU $v0,$ta1,$ta0 |
- |
- and $ta0,$a2,$minus4 |
- mflo $at |
- mfhi $ta2 |
- $ADDU $v0,$at |
- sltu $ta3,$v0,$at |
- $ST $v0,-$BNSZ($a0) |
- $ADDU $v0,$ta3,$ta2 |
- .set noreorder |
- bgtzl $ta0,.L_bn_mul_words_loop |
- $LD $t0,0($a1) |
- |
- beqz $a2,.L_bn_mul_words_return |
- nop |
- |
-.L_bn_mul_words_tail: |
- .set reorder |
- $LD $t0,0($a1) |
- $MULTU $t0,$a3 |
- subu $a2,1 |
- mflo $at |
- mfhi $t0 |
- $ADDU $v0,$at |
- sltu $t1,$v0,$at |
- $ST $v0,0($a0) |
- $ADDU $v0,$t1,$t0 |
- beqz $a2,.L_bn_mul_words_return |
- |
- $LD $t0,$BNSZ($a1) |
- $MULTU $t0,$a3 |
- subu $a2,1 |
- mflo $at |
- mfhi $t0 |
- $ADDU $v0,$at |
- sltu $t1,$v0,$at |
- $ST $v0,$BNSZ($a0) |
- $ADDU $v0,$t1,$t0 |
- beqz $a2,.L_bn_mul_words_return |
- |
- $LD $t0,2*$BNSZ($a1) |
- $MULTU $t0,$a3 |
- mflo $at |
- mfhi $t0 |
- $ADDU $v0,$at |
- sltu $t1,$v0,$at |
- $ST $v0,2*$BNSZ($a0) |
- $ADDU $v0,$t1,$t0 |
- |
-.L_bn_mul_words_return: |
- .set noreorder |
-___ |
-$code.=<<___ if ($flavour =~ /nubi/i); |
- $REG_L $t3,4*$SZREG($sp) |
- $REG_L $t2,3*$SZREG($sp) |
- $REG_L $t1,2*$SZREG($sp) |
- $REG_L $t0,1*$SZREG($sp) |
- $REG_L $gp,0*$SZREG($sp) |
- $PTR_ADD $sp,6*$SZREG |
-___ |
-$code.=<<___; |
- jr $ra |
- move $a0,$v0 |
-.end bn_mul_words_internal |
- |
-.align 5 |
-.globl bn_sqr_words |
-.ent bn_sqr_words |
-bn_sqr_words: |
- .set noreorder |
- bgtz $a2,bn_sqr_words_internal |
- move $v0,$zero |
- jr $ra |
- move $a0,$v0 |
-.end bn_sqr_words |
- |
-.align 5 |
-.ent bn_sqr_words_internal |
-bn_sqr_words_internal: |
-___ |
-$code.=<<___ if ($flavour =~ /nubi/i); |
- .frame $sp,6*$SZREG,$ra |
- .mask 0x8000f008,-$SZREG |
- .set noreorder |
- $PTR_SUB $sp,6*$SZREG |
- $REG_S $ra,5*$SZREG($sp) |
- $REG_S $t3,4*$SZREG($sp) |
- $REG_S $t2,3*$SZREG($sp) |
- $REG_S $t1,2*$SZREG($sp) |
- $REG_S $t0,1*$SZREG($sp) |
- $REG_S $gp,0*$SZREG($sp) |
-___ |
-$code.=<<___; |
- .set reorder |
- li $minus4,-4 |
- and $ta0,$a2,$minus4 |
- $LD $t0,0($a1) |
- beqz $ta0,.L_bn_sqr_words_tail |
- |
-.L_bn_sqr_words_loop: |
- $MULTU $t0,$t0 |
- $LD $t2,$BNSZ($a1) |
- $LD $ta0,2*$BNSZ($a1) |
- $LD $ta2,3*$BNSZ($a1) |
- mflo $t1 |
- mfhi $t0 |
- $ST $t1,0($a0) |
- $ST $t0,$BNSZ($a0) |
- |
- $MULTU $t2,$t2 |
- subu $a2,4 |
- $PTR_ADD $a0,8*$BNSZ |
- $PTR_ADD $a1,4*$BNSZ |
- mflo $t3 |
- mfhi $t2 |
- $ST $t3,-6*$BNSZ($a0) |
- $ST $t2,-5*$BNSZ($a0) |
- |
- $MULTU $ta0,$ta0 |
- mflo $ta1 |
- mfhi $ta0 |
- $ST $ta1,-4*$BNSZ($a0) |
- $ST $ta0,-3*$BNSZ($a0) |
- |
- |
- $MULTU $ta2,$ta2 |
- and $ta0,$a2,$minus4 |
- mflo $ta3 |
- mfhi $ta2 |
- $ST $ta3,-2*$BNSZ($a0) |
- $ST $ta2,-$BNSZ($a0) |
- |
- .set noreorder |
- bgtzl $ta0,.L_bn_sqr_words_loop |
- $LD $t0,0($a1) |
- |
- beqz $a2,.L_bn_sqr_words_return |
- nop |
- |
-.L_bn_sqr_words_tail: |
- .set reorder |
- $LD $t0,0($a1) |
- $MULTU $t0,$t0 |
- subu $a2,1 |
- mflo $t1 |
- mfhi $t0 |
- $ST $t1,0($a0) |
- $ST $t0,$BNSZ($a0) |
- beqz $a2,.L_bn_sqr_words_return |
- |
- $LD $t0,$BNSZ($a1) |
- $MULTU $t0,$t0 |
- subu $a2,1 |
- mflo $t1 |
- mfhi $t0 |
- $ST $t1,2*$BNSZ($a0) |
- $ST $t0,3*$BNSZ($a0) |
- beqz $a2,.L_bn_sqr_words_return |
- |
- $LD $t0,2*$BNSZ($a1) |
- $MULTU $t0,$t0 |
- mflo $t1 |
- mfhi $t0 |
- $ST $t1,4*$BNSZ($a0) |
- $ST $t0,5*$BNSZ($a0) |
- |
-.L_bn_sqr_words_return: |
- .set noreorder |
-___ |
-$code.=<<___ if ($flavour =~ /nubi/i); |
- $REG_L $t3,4*$SZREG($sp) |
- $REG_L $t2,3*$SZREG($sp) |
- $REG_L $t1,2*$SZREG($sp) |
- $REG_L $t0,1*$SZREG($sp) |
- $REG_L $gp,0*$SZREG($sp) |
- $PTR_ADD $sp,6*$SZREG |
-___ |
-$code.=<<___; |
- jr $ra |
- move $a0,$v0 |
- |
-.end bn_sqr_words_internal |
- |
-.align 5 |
-.globl bn_add_words |
-.ent bn_add_words |
-bn_add_words: |
- .set noreorder |
- bgtz $a3,bn_add_words_internal |
- move $v0,$zero |
- jr $ra |
- move $a0,$v0 |
-.end bn_add_words |
- |
-.align 5 |
-.ent bn_add_words_internal |
-bn_add_words_internal: |
-___ |
-$code.=<<___ if ($flavour =~ /nubi/i); |
- .frame $sp,6*$SZREG,$ra |
- .mask 0x8000f008,-$SZREG |
- .set noreorder |
- $PTR_SUB $sp,6*$SZREG |
- $REG_S $ra,5*$SZREG($sp) |
- $REG_S $t3,4*$SZREG($sp) |
- $REG_S $t2,3*$SZREG($sp) |
- $REG_S $t1,2*$SZREG($sp) |
- $REG_S $t0,1*$SZREG($sp) |
- $REG_S $gp,0*$SZREG($sp) |
-___ |
-$code.=<<___; |
- .set reorder |
- li $minus4,-4 |
- and $at,$a3,$minus4 |
- $LD $t0,0($a1) |
- beqz $at,.L_bn_add_words_tail |
- |
-.L_bn_add_words_loop: |
- $LD $ta0,0($a2) |
- subu $a3,4 |
- $LD $t1,$BNSZ($a1) |
- and $at,$a3,$minus4 |
- $LD $t2,2*$BNSZ($a1) |
- $PTR_ADD $a2,4*$BNSZ |
- $LD $t3,3*$BNSZ($a1) |
- $PTR_ADD $a0,4*$BNSZ |
- $LD $ta1,-3*$BNSZ($a2) |
- $PTR_ADD $a1,4*$BNSZ |
- $LD $ta2,-2*$BNSZ($a2) |
- $LD $ta3,-$BNSZ($a2) |
- $ADDU $ta0,$t0 |
- sltu $t8,$ta0,$t0 |
- $ADDU $t0,$ta0,$v0 |
- sltu $v0,$t0,$ta0 |
- $ST $t0,-4*$BNSZ($a0) |
- $ADDU $v0,$t8 |
- |
- $ADDU $ta1,$t1 |
- sltu $t9,$ta1,$t1 |
- $ADDU $t1,$ta1,$v0 |
- sltu $v0,$t1,$ta1 |
- $ST $t1,-3*$BNSZ($a0) |
- $ADDU $v0,$t9 |
- |
- $ADDU $ta2,$t2 |
- sltu $t8,$ta2,$t2 |
- $ADDU $t2,$ta2,$v0 |
- sltu $v0,$t2,$ta2 |
- $ST $t2,-2*$BNSZ($a0) |
- $ADDU $v0,$t8 |
- |
- $ADDU $ta3,$t3 |
- sltu $t9,$ta3,$t3 |
- $ADDU $t3,$ta3,$v0 |
- sltu $v0,$t3,$ta3 |
- $ST $t3,-$BNSZ($a0) |
- $ADDU $v0,$t9 |
- |
- .set noreorder |
- bgtzl $at,.L_bn_add_words_loop |
- $LD $t0,0($a1) |
- |
- beqz $a3,.L_bn_add_words_return |
- nop |
- |
-.L_bn_add_words_tail: |
- .set reorder |
- $LD $t0,0($a1) |
- $LD $ta0,0($a2) |
- $ADDU $ta0,$t0 |
- subu $a3,1 |
- sltu $t8,$ta0,$t0 |
- $ADDU $t0,$ta0,$v0 |
- sltu $v0,$t0,$ta0 |
- $ST $t0,0($a0) |
- $ADDU $v0,$t8 |
- beqz $a3,.L_bn_add_words_return |
- |
- $LD $t1,$BNSZ($a1) |
- $LD $ta1,$BNSZ($a2) |
- $ADDU $ta1,$t1 |
- subu $a3,1 |
- sltu $t9,$ta1,$t1 |
- $ADDU $t1,$ta1,$v0 |
- sltu $v0,$t1,$ta1 |
- $ST $t1,$BNSZ($a0) |
- $ADDU $v0,$t9 |
- beqz $a3,.L_bn_add_words_return |
- |
- $LD $t2,2*$BNSZ($a1) |
- $LD $ta2,2*$BNSZ($a2) |
- $ADDU $ta2,$t2 |
- sltu $t8,$ta2,$t2 |
- $ADDU $t2,$ta2,$v0 |
- sltu $v0,$t2,$ta2 |
- $ST $t2,2*$BNSZ($a0) |
- $ADDU $v0,$t8 |
- |
-.L_bn_add_words_return: |
- .set noreorder |
-___ |
-$code.=<<___ if ($flavour =~ /nubi/i); |
- $REG_L $t3,4*$SZREG($sp) |
- $REG_L $t2,3*$SZREG($sp) |
- $REG_L $t1,2*$SZREG($sp) |
- $REG_L $t0,1*$SZREG($sp) |
- $REG_L $gp,0*$SZREG($sp) |
- $PTR_ADD $sp,6*$SZREG |
-___ |
-$code.=<<___; |
- jr $ra |
- move $a0,$v0 |
- |
-.end bn_add_words_internal |
- |
-.align 5 |
-.globl bn_sub_words |
-.ent bn_sub_words |
-bn_sub_words: |
- .set noreorder |
- bgtz $a3,bn_sub_words_internal |
- move $v0,$zero |
- jr $ra |
- move $a0,$zero |
-.end bn_sub_words |
- |
-.align 5 |
-.ent bn_sub_words_internal |
-bn_sub_words_internal: |
-___ |
-$code.=<<___ if ($flavour =~ /nubi/i); |
- .frame $sp,6*$SZREG,$ra |
- .mask 0x8000f008,-$SZREG |
- .set noreorder |
- $PTR_SUB $sp,6*$SZREG |
- $REG_S $ra,5*$SZREG($sp) |
- $REG_S $t3,4*$SZREG($sp) |
- $REG_S $t2,3*$SZREG($sp) |
- $REG_S $t1,2*$SZREG($sp) |
- $REG_S $t0,1*$SZREG($sp) |
- $REG_S $gp,0*$SZREG($sp) |
-___ |
-$code.=<<___; |
- .set reorder |
- li $minus4,-4 |
- and $at,$a3,$minus4 |
- $LD $t0,0($a1) |
- beqz $at,.L_bn_sub_words_tail |
- |
-.L_bn_sub_words_loop: |
- $LD $ta0,0($a2) |
- subu $a3,4 |
- $LD $t1,$BNSZ($a1) |
- and $at,$a3,$minus4 |
- $LD $t2,2*$BNSZ($a1) |
- $PTR_ADD $a2,4*$BNSZ |
- $LD $t3,3*$BNSZ($a1) |
- $PTR_ADD $a0,4*$BNSZ |
- $LD $ta1,-3*$BNSZ($a2) |
- $PTR_ADD $a1,4*$BNSZ |
- $LD $ta2,-2*$BNSZ($a2) |
- $LD $ta3,-$BNSZ($a2) |
- sltu $t8,$t0,$ta0 |
- $SUBU $ta0,$t0,$ta0 |
- $SUBU $t0,$ta0,$v0 |
- sgtu $v0,$t0,$ta0 |
- $ST $t0,-4*$BNSZ($a0) |
- $ADDU $v0,$t8 |
- |
- sltu $t9,$t1,$ta1 |
- $SUBU $ta1,$t1,$ta1 |
- $SUBU $t1,$ta1,$v0 |
- sgtu $v0,$t1,$ta1 |
- $ST $t1,-3*$BNSZ($a0) |
- $ADDU $v0,$t9 |
- |
- |
- sltu $t8,$t2,$ta2 |
- $SUBU $ta2,$t2,$ta2 |
- $SUBU $t2,$ta2,$v0 |
- sgtu $v0,$t2,$ta2 |
- $ST $t2,-2*$BNSZ($a0) |
- $ADDU $v0,$t8 |
- |
- sltu $t9,$t3,$ta3 |
- $SUBU $ta3,$t3,$ta3 |
- $SUBU $t3,$ta3,$v0 |
- sgtu $v0,$t3,$ta3 |
- $ST $t3,-$BNSZ($a0) |
- $ADDU $v0,$t9 |
- |
- .set noreorder |
- bgtzl $at,.L_bn_sub_words_loop |
- $LD $t0,0($a1) |
- |
- beqz $a3,.L_bn_sub_words_return |
- nop |
- |
-.L_bn_sub_words_tail: |
- .set reorder |
- $LD $t0,0($a1) |
- $LD $ta0,0($a2) |
- subu $a3,1 |
- sltu $t8,$t0,$ta0 |
- $SUBU $ta0,$t0,$ta0 |
- $SUBU $t0,$ta0,$v0 |
- sgtu $v0,$t0,$ta0 |
- $ST $t0,0($a0) |
- $ADDU $v0,$t8 |
- beqz $a3,.L_bn_sub_words_return |
- |
- $LD $t1,$BNSZ($a1) |
- subu $a3,1 |
- $LD $ta1,$BNSZ($a2) |
- sltu $t9,$t1,$ta1 |
- $SUBU $ta1,$t1,$ta1 |
- $SUBU $t1,$ta1,$v0 |
- sgtu $v0,$t1,$ta1 |
- $ST $t1,$BNSZ($a0) |
- $ADDU $v0,$t9 |
- beqz $a3,.L_bn_sub_words_return |
- |
- $LD $t2,2*$BNSZ($a1) |
- $LD $ta2,2*$BNSZ($a2) |
- sltu $t8,$t2,$ta2 |
- $SUBU $ta2,$t2,$ta2 |
- $SUBU $t2,$ta2,$v0 |
- sgtu $v0,$t2,$ta2 |
- $ST $t2,2*$BNSZ($a0) |
- $ADDU $v0,$t8 |
- |
-.L_bn_sub_words_return: |
- .set noreorder |
-___ |
-$code.=<<___ if ($flavour =~ /nubi/i); |
- $REG_L $t3,4*$SZREG($sp) |
- $REG_L $t2,3*$SZREG($sp) |
- $REG_L $t1,2*$SZREG($sp) |
- $REG_L $t0,1*$SZREG($sp) |
- $REG_L $gp,0*$SZREG($sp) |
- $PTR_ADD $sp,6*$SZREG |
-___ |
-$code.=<<___; |
- jr $ra |
- move $a0,$v0 |
-.end bn_sub_words_internal |
- |
-.align 5 |
-.globl bn_div_3_words |
-.ent bn_div_3_words |
-bn_div_3_words: |
- .set noreorder |
- move $a3,$a0 # we know that bn_div_words does not |
- # touch $a3, $ta2, $ta3 and preserves $a2 |
- # so that we can save two arguments |
- # and return address in registers |
- # instead of stack:-) |
- |
- $LD $a0,($a3) |
- move $ta2,$a1 |
- bne $a0,$a2,bn_div_3_words_internal |
- $LD $a1,-$BNSZ($a3) |
- li $v0,-1 |
- jr $ra |
- move $a0,$v0 |
-.end bn_div_3_words |
- |
-.align 5 |
-.ent bn_div_3_words_internal |
-bn_div_3_words_internal: |
-___ |
-$code.=<<___ if ($flavour =~ /nubi/i); |
- .frame $sp,6*$SZREG,$ra |
- .mask 0x8000f008,-$SZREG |
- .set noreorder |
- $PTR_SUB $sp,6*$SZREG |
- $REG_S $ra,5*$SZREG($sp) |
- $REG_S $t3,4*$SZREG($sp) |
- $REG_S $t2,3*$SZREG($sp) |
- $REG_S $t1,2*$SZREG($sp) |
- $REG_S $t0,1*$SZREG($sp) |
- $REG_S $gp,0*$SZREG($sp) |
-___ |
-$code.=<<___; |
- .set reorder |
- move $ta3,$ra |
- bal bn_div_words_internal |
- move $ra,$ta3 |
- $MULTU $ta2,$v0 |
- $LD $t2,-2*$BNSZ($a3) |
- move $ta0,$zero |
- mfhi $t1 |
- mflo $t0 |
- sltu $t8,$t1,$a1 |
-.L_bn_div_3_words_inner_loop: |
- bnez $t8,.L_bn_div_3_words_inner_loop_done |
- sgeu $at,$t2,$t0 |
- seq $t9,$t1,$a1 |
- and $at,$t9 |
- sltu $t3,$t0,$ta2 |
- $ADDU $a1,$a2 |
- $SUBU $t1,$t3 |
- $SUBU $t0,$ta2 |
- sltu $t8,$t1,$a1 |
- sltu $ta0,$a1,$a2 |
- or $t8,$ta0 |
- .set noreorder |
- beqzl $at,.L_bn_div_3_words_inner_loop |
- $SUBU $v0,1 |
- .set reorder |
-.L_bn_div_3_words_inner_loop_done: |
- .set noreorder |
-___ |
-$code.=<<___ if ($flavour =~ /nubi/i); |
- $REG_L $t3,4*$SZREG($sp) |
- $REG_L $t2,3*$SZREG($sp) |
- $REG_L $t1,2*$SZREG($sp) |
- $REG_L $t0,1*$SZREG($sp) |
- $REG_L $gp,0*$SZREG($sp) |
- $PTR_ADD $sp,6*$SZREG |
-___ |
-$code.=<<___; |
- jr $ra |
- move $a0,$v0 |
-.end bn_div_3_words_internal |
- |
-.align 5 |
-.globl bn_div_words |
-.ent bn_div_words |
-bn_div_words: |
- .set noreorder |
- bnez $a2,bn_div_words_internal |
- li $v0,-1 # I would rather signal div-by-zero |
- # which can be done with 'break 7' |
- jr $ra |
- move $a0,$v0 |
-.end bn_div_words |
- |
-.align 5 |
-.ent bn_div_words_internal |
-bn_div_words_internal: |
-___ |
-$code.=<<___ if ($flavour =~ /nubi/i); |
- .frame $sp,6*$SZREG,$ra |
- .mask 0x8000f008,-$SZREG |
- .set noreorder |
- $PTR_SUB $sp,6*$SZREG |
- $REG_S $ra,5*$SZREG($sp) |
- $REG_S $t3,4*$SZREG($sp) |
- $REG_S $t2,3*$SZREG($sp) |
- $REG_S $t1,2*$SZREG($sp) |
- $REG_S $t0,1*$SZREG($sp) |
- $REG_S $gp,0*$SZREG($sp) |
-___ |
-$code.=<<___; |
- move $v1,$zero |
- bltz $a2,.L_bn_div_words_body |
- move $t9,$v1 |
- $SLL $a2,1 |
- bgtz $a2,.-4 |
- addu $t9,1 |
- |
- .set reorder |
- negu $t1,$t9 |
- li $t2,-1 |
- $SLL $t2,$t1 |
- and $t2,$a0 |
- $SRL $at,$a1,$t1 |
- .set noreorder |
- bnezl $t2,.+8 |
- break 6 # signal overflow |
- .set reorder |
- $SLL $a0,$t9 |
- $SLL $a1,$t9 |
- or $a0,$at |
-___ |
-$QT=$ta0; |
-$HH=$ta1; |
-$DH=$v1; |
-$code.=<<___; |
-.L_bn_div_words_body: |
- $SRL $DH,$a2,4*$BNSZ # bits |
- sgeu $at,$a0,$a2 |
- .set noreorder |
- bnezl $at,.+8 |
- $SUBU $a0,$a2 |
- .set reorder |
- |
- li $QT,-1 |
- $SRL $HH,$a0,4*$BNSZ # bits |
- $SRL $QT,4*$BNSZ # q=0xffffffff |
- beq $DH,$HH,.L_bn_div_words_skip_div1 |
- $DIVU $zero,$a0,$DH |
- mflo $QT |
-.L_bn_div_words_skip_div1: |
- $MULTU $a2,$QT |
- $SLL $t3,$a0,4*$BNSZ # bits |
- $SRL $at,$a1,4*$BNSZ # bits |
- or $t3,$at |
- mflo $t0 |
- mfhi $t1 |
-.L_bn_div_words_inner_loop1: |
- sltu $t2,$t3,$t0 |
- seq $t8,$HH,$t1 |
- sltu $at,$HH,$t1 |
- and $t2,$t8 |
- sltu $v0,$t0,$a2 |
- or $at,$t2 |
- .set noreorder |
- beqz $at,.L_bn_div_words_inner_loop1_done |
- $SUBU $t1,$v0 |
- $SUBU $t0,$a2 |
- b .L_bn_div_words_inner_loop1 |
- $SUBU $QT,1 |
- .set reorder |
-.L_bn_div_words_inner_loop1_done: |
- |
- $SLL $a1,4*$BNSZ # bits |
- $SUBU $a0,$t3,$t0 |
- $SLL $v0,$QT,4*$BNSZ # bits |
- |
- li $QT,-1 |
- $SRL $HH,$a0,4*$BNSZ # bits |
- $SRL $QT,4*$BNSZ # q=0xffffffff |
- beq $DH,$HH,.L_bn_div_words_skip_div2 |
- $DIVU $zero,$a0,$DH |
- mflo $QT |
-.L_bn_div_words_skip_div2: |
- $MULTU $a2,$QT |
- $SLL $t3,$a0,4*$BNSZ # bits |
- $SRL $at,$a1,4*$BNSZ # bits |
- or $t3,$at |
- mflo $t0 |
- mfhi $t1 |
-.L_bn_div_words_inner_loop2: |
- sltu $t2,$t3,$t0 |
- seq $t8,$HH,$t1 |
- sltu $at,$HH,$t1 |
- and $t2,$t8 |
- sltu $v1,$t0,$a2 |
- or $at,$t2 |
- .set noreorder |
- beqz $at,.L_bn_div_words_inner_loop2_done |
- $SUBU $t1,$v1 |
- $SUBU $t0,$a2 |
- b .L_bn_div_words_inner_loop2 |
- $SUBU $QT,1 |
- .set reorder |
-.L_bn_div_words_inner_loop2_done: |
- |
- $SUBU $a0,$t3,$t0 |
- or $v0,$QT |
- $SRL $v1,$a0,$t9 # $v1 contains remainder if anybody wants it |
- $SRL $a2,$t9 # restore $a2 |
- |
- .set noreorder |
- move $a1,$v1 |
-___ |
-$code.=<<___ if ($flavour =~ /nubi/i); |
- $REG_L $t3,4*$SZREG($sp) |
- $REG_L $t2,3*$SZREG($sp) |
- $REG_L $t1,2*$SZREG($sp) |
- $REG_L $t0,1*$SZREG($sp) |
- $REG_L $gp,0*$SZREG($sp) |
- $PTR_ADD $sp,6*$SZREG |
-___ |
-$code.=<<___; |
- jr $ra |
- move $a0,$v0 |
-.end bn_div_words_internal |
-___ |
-undef $HH; undef $QT; undef $DH; |
- |
-($a_0,$a_1,$a_2,$a_3)=($t0,$t1,$t2,$t3); |
-($b_0,$b_1,$b_2,$b_3)=($ta0,$ta1,$ta2,$ta3); |
- |
-($a_4,$a_5,$a_6,$a_7)=($s0,$s2,$s4,$a1); # once we load a[7], no use for $a1 |
-($b_4,$b_5,$b_6,$b_7)=($s1,$s3,$s5,$a2); # once we load b[7], no use for $a2 |
- |
-($t_1,$t_2,$c_1,$c_2,$c_3)=($t8,$t9,$v0,$v1,$a3); |
- |
-$code.=<<___; |
- |
-.align 5 |
-.globl bn_mul_comba8 |
-.ent bn_mul_comba8 |
-bn_mul_comba8: |
- .set noreorder |
-___ |
-$code.=<<___ if ($flavour =~ /nubi/i); |
- .frame $sp,12*$SZREG,$ra |
- .mask 0x803ff008,-$SZREG |
- $PTR_SUB $sp,12*$SZREG |
- $REG_S $ra,11*$SZREG($sp) |
- $REG_S $s5,10*$SZREG($sp) |
- $REG_S $s4,9*$SZREG($sp) |
- $REG_S $s3,8*$SZREG($sp) |
- $REG_S $s2,7*$SZREG($sp) |
- $REG_S $s1,6*$SZREG($sp) |
- $REG_S $s0,5*$SZREG($sp) |
- $REG_S $t3,4*$SZREG($sp) |
- $REG_S $t2,3*$SZREG($sp) |
- $REG_S $t1,2*$SZREG($sp) |
- $REG_S $t0,1*$SZREG($sp) |
- $REG_S $gp,0*$SZREG($sp) |
-___ |
-$code.=<<___ if ($flavour !~ /nubi/i); |
- .frame $sp,6*$SZREG,$ra |
- .mask 0x003f0000,-$SZREG |
- $PTR_SUB $sp,6*$SZREG |
- $REG_S $s5,5*$SZREG($sp) |
- $REG_S $s4,4*$SZREG($sp) |
- $REG_S $s3,3*$SZREG($sp) |
- $REG_S $s2,2*$SZREG($sp) |
- $REG_S $s1,1*$SZREG($sp) |
- $REG_S $s0,0*$SZREG($sp) |
-___ |
-$code.=<<___; |
- |
- .set reorder |
- $LD $a_0,0($a1) # If compiled with -mips3 option on |
- # R5000 box assembler barks on this |
- # 1ine with "should not have mult/div |
- # as last instruction in bb (R10K |
- # bug)" warning. If anybody out there |
- # has a clue about how to circumvent |
- # this do send me a note. |
- # <appro\@fy.chalmers.se> |
- |
- $LD $b_0,0($a2) |
- $LD $a_1,$BNSZ($a1) |
- $LD $a_2,2*$BNSZ($a1) |
- $MULTU $a_0,$b_0 # mul_add_c(a[0],b[0],c1,c2,c3); |
- $LD $a_3,3*$BNSZ($a1) |
- $LD $b_1,$BNSZ($a2) |
- $LD $b_2,2*$BNSZ($a2) |
- $LD $b_3,3*$BNSZ($a2) |
- mflo $c_1 |
- mfhi $c_2 |
- |
- $LD $a_4,4*$BNSZ($a1) |
- $LD $a_5,5*$BNSZ($a1) |
- $MULTU $a_0,$b_1 # mul_add_c(a[0],b[1],c2,c3,c1); |
- $LD $a_6,6*$BNSZ($a1) |
- $LD $a_7,7*$BNSZ($a1) |
- $LD $b_4,4*$BNSZ($a2) |
- $LD $b_5,5*$BNSZ($a2) |
- mflo $t_1 |
- mfhi $t_2 |
- $ADDU $c_2,$t_1 |
- sltu $at,$c_2,$t_1 |
- $MULTU $a_1,$b_0 # mul_add_c(a[1],b[0],c2,c3,c1); |
- $ADDU $c_3,$t_2,$at |
- $LD $b_6,6*$BNSZ($a2) |
- $LD $b_7,7*$BNSZ($a2) |
- $ST $c_1,0($a0) # r[0]=c1; |
- mflo $t_1 |
- mfhi $t_2 |
- $ADDU $c_2,$t_1 |
- sltu $at,$c_2,$t_1 |
- $MULTU $a_2,$b_0 # mul_add_c(a[2],b[0],c3,c1,c2); |
- $ADDU $t_2,$at |
- $ADDU $c_3,$t_2 |
- sltu $c_1,$c_3,$t_2 |
- $ST $c_2,$BNSZ($a0) # r[1]=c2; |
- |
- mflo $t_1 |
- mfhi $t_2 |
- $ADDU $c_3,$t_1 |
- sltu $at,$c_3,$t_1 |
- $MULTU $a_1,$b_1 # mul_add_c(a[1],b[1],c3,c1,c2); |
- $ADDU $t_2,$at |
- $ADDU $c_1,$t_2 |
- mflo $t_1 |
- mfhi $t_2 |
- $ADDU $c_3,$t_1 |
- sltu $at,$c_3,$t_1 |
- $MULTU $a_0,$b_2 # mul_add_c(a[0],b[2],c3,c1,c2); |
- $ADDU $t_2,$at |
- $ADDU $c_1,$t_2 |
- sltu $c_2,$c_1,$t_2 |
- mflo $t_1 |
- mfhi $t_2 |
- $ADDU $c_3,$t_1 |
- sltu $at,$c_3,$t_1 |
- $MULTU $a_0,$b_3 # mul_add_c(a[0],b[3],c1,c2,c3); |
- $ADDU $t_2,$at |
- $ADDU $c_1,$t_2 |
- sltu $at,$c_1,$t_2 |
- $ADDU $c_2,$at |
- $ST $c_3,2*$BNSZ($a0) # r[2]=c3; |
- |
- mflo $t_1 |
- mfhi $t_2 |
- $ADDU $c_1,$t_1 |
- sltu $at,$c_1,$t_1 |
- $MULTU $a_1,$b_2 # mul_add_c(a[1],b[2],c1,c2,c3); |
- $ADDU $t_2,$at |
- $ADDU $c_2,$t_2 |
- sltu $c_3,$c_2,$t_2 |
- mflo $t_1 |
- mfhi $t_2 |
- $ADDU $c_1,$t_1 |
- sltu $at,$c_1,$t_1 |
- $MULTU $a_2,$b_1 # mul_add_c(a[2],b[1],c1,c2,c3); |
- $ADDU $t_2,$at |
- $ADDU $c_2,$t_2 |
- sltu $at,$c_2,$t_2 |
- $ADDU $c_3,$at |
- mflo $t_1 |
- mfhi $t_2 |
- $ADDU $c_1,$t_1 |
- sltu $at,$c_1,$t_1 |
- $MULTU $a_3,$b_0 # mul_add_c(a[3],b[0],c1,c2,c3); |
- $ADDU $t_2,$at |
- $ADDU $c_2,$t_2 |
- sltu $at,$c_2,$t_2 |
- $ADDU $c_3,$at |
- mflo $t_1 |
- mfhi $t_2 |
- $ADDU $c_1,$t_1 |
- sltu $at,$c_1,$t_1 |
- $MULTU $a_4,$b_0 # mul_add_c(a[4],b[0],c2,c3,c1); |
- $ADDU $t_2,$at |
- $ADDU $c_2,$t_2 |
- sltu $at,$c_2,$t_2 |
- $ADDU $c_3,$at |
- $ST $c_1,3*$BNSZ($a0) # r[3]=c1; |
- |
- mflo $t_1 |
- mfhi $t_2 |
- $ADDU $c_2,$t_1 |
- sltu $at,$c_2,$t_1 |
- $MULTU $a_3,$b_1 # mul_add_c(a[3],b[1],c2,c3,c1); |
- $ADDU $t_2,$at |
- $ADDU $c_3,$t_2 |
- sltu $c_1,$c_3,$t_2 |
- mflo $t_1 |
- mfhi $t_2 |
- $ADDU $c_2,$t_1 |
- sltu $at,$c_2,$t_1 |
- $MULTU $a_2,$b_2 # mul_add_c(a[2],b[2],c2,c3,c1); |
- $ADDU $t_2,$at |
- $ADDU $c_3,$t_2 |
- sltu $at,$c_3,$t_2 |
- $ADDU $c_1,$at |
- mflo $t_1 |
- mfhi $t_2 |
- $ADDU $c_2,$t_1 |
- sltu $at,$c_2,$t_1 |
- $MULTU $a_1,$b_3 # mul_add_c(a[1],b[3],c2,c3,c1); |
- $ADDU $t_2,$at |
- $ADDU $c_3,$t_2 |
- sltu $at,$c_3,$t_2 |
- $ADDU $c_1,$at |
- mflo $t_1 |
- mfhi $t_2 |
- $ADDU $c_2,$t_1 |
- sltu $at,$c_2,$t_1 |
- $MULTU $a_0,$b_4 # mul_add_c(a[0],b[4],c2,c3,c1); |
- $ADDU $t_2,$at |
- $ADDU $c_3,$t_2 |
- sltu $at,$c_3,$t_2 |
- $ADDU $c_1,$at |
- mflo $t_1 |
- mfhi $t_2 |
- $ADDU $c_2,$t_1 |
- sltu $at,$c_2,$t_1 |
- $MULTU $a_0,$b_5 # mul_add_c(a[0],b[5],c3,c1,c2); |
- $ADDU $t_2,$at |
- $ADDU $c_3,$t_2 |
- sltu $at,$c_3,$t_2 |
- $ADDU $c_1,$at |
- $ST $c_2,4*$BNSZ($a0) # r[4]=c2; |
- |
- mflo $t_1 |
- mfhi $t_2 |
- $ADDU $c_3,$t_1 |
- sltu $at,$c_3,$t_1 |
- $MULTU $a_1,$b_4 # mul_add_c(a[1],b[4],c3,c1,c2); |
- $ADDU $t_2,$at |
- $ADDU $c_1,$t_2 |
- sltu $c_2,$c_1,$t_2 |
- mflo $t_1 |
- mfhi $t_2 |
- $ADDU $c_3,$t_1 |
- sltu $at,$c_3,$t_1 |
- $MULTU $a_2,$b_3 # mul_add_c(a[2],b[3],c3,c1,c2); |
- $ADDU $t_2,$at |
- $ADDU $c_1,$t_2 |
- sltu $at,$c_1,$t_2 |
- $ADDU $c_2,$at |
- mflo $t_1 |
- mfhi $t_2 |
- $ADDU $c_3,$t_1 |
- sltu $at,$c_3,$t_1 |
- $MULTU $a_3,$b_2 # mul_add_c(a[3],b[2],c3,c1,c2); |
- $ADDU $t_2,$at |
- $ADDU $c_1,$t_2 |
- sltu $at,$c_1,$t_2 |
- $ADDU $c_2,$at |
- mflo $t_1 |
- mfhi $t_2 |
- $ADDU $c_3,$t_1 |
- sltu $at,$c_3,$t_1 |
- $MULTU $a_4,$b_1 # mul_add_c(a[4],b[1],c3,c1,c2); |
- $ADDU $t_2,$at |
- $ADDU $c_1,$t_2 |
- sltu $at,$c_1,$t_2 |
- $ADDU $c_2,$at |
- mflo $t_1 |
- mfhi $t_2 |
- $ADDU $c_3,$t_1 |
- sltu $at,$c_3,$t_1 |
- $MULTU $a_5,$b_0 # mul_add_c(a[5],b[0],c3,c1,c2); |
- $ADDU $t_2,$at |
- $ADDU $c_1,$t_2 |
- sltu $at,$c_1,$t_2 |
- $ADDU $c_2,$at |
- mflo $t_1 |
- mfhi $t_2 |
- $ADDU $c_3,$t_1 |
- sltu $at,$c_3,$t_1 |
- $MULTU $a_6,$b_0 # mul_add_c(a[6],b[0],c1,c2,c3); |
- $ADDU $t_2,$at |
- $ADDU $c_1,$t_2 |
- sltu $at,$c_1,$t_2 |
- $ADDU $c_2,$at |
- $ST $c_3,5*$BNSZ($a0) # r[5]=c3; |
- |
- mflo $t_1 |
- mfhi $t_2 |
- $ADDU $c_1,$t_1 |
- sltu $at,$c_1,$t_1 |
- $MULTU $a_5,$b_1 # mul_add_c(a[5],b[1],c1,c2,c3); |
- $ADDU $t_2,$at |
- $ADDU $c_2,$t_2 |
- sltu $c_3,$c_2,$t_2 |
- mflo $t_1 |
- mfhi $t_2 |
- $ADDU $c_1,$t_1 |
- sltu $at,$c_1,$t_1 |
- $MULTU $a_4,$b_2 # mul_add_c(a[4],b[2],c1,c2,c3); |
- $ADDU $t_2,$at |
- $ADDU $c_2,$t_2 |
- sltu $at,$c_2,$t_2 |
- $ADDU $c_3,$at |
- mflo $t_1 |
- mfhi $t_2 |
- $ADDU $c_1,$t_1 |
- sltu $at,$c_1,$t_1 |
- $MULTU $a_3,$b_3 # mul_add_c(a[3],b[3],c1,c2,c3); |
- $ADDU $t_2,$at |
- $ADDU $c_2,$t_2 |
- sltu $at,$c_2,$t_2 |
- $ADDU $c_3,$at |
- mflo $t_1 |
- mfhi $t_2 |
- $ADDU $c_1,$t_1 |
- sltu $at,$c_1,$t_1 |
- $MULTU $a_2,$b_4 # mul_add_c(a[2],b[4],c1,c2,c3); |
- $ADDU $t_2,$at |
- $ADDU $c_2,$t_2 |
- sltu $at,$c_2,$t_2 |
- $ADDU $c_3,$at |
- mflo $t_1 |
- mfhi $t_2 |
- $ADDU $c_1,$t_1 |
- sltu $at,$c_1,$t_1 |
- $MULTU $a_1,$b_5 # mul_add_c(a[1],b[5],c1,c2,c3); |
- $ADDU $t_2,$at |
- $ADDU $c_2,$t_2 |
- sltu $at,$c_2,$t_2 |
- $ADDU $c_3,$at |
- mflo $t_1 |
- mfhi $t_2 |
- $ADDU $c_1,$t_1 |
- sltu $at,$c_1,$t_1 |
- $MULTU $a_0,$b_6 # mul_add_c(a[0],b[6],c1,c2,c3); |
- $ADDU $t_2,$at |
- $ADDU $c_2,$t_2 |
- sltu $at,$c_2,$t_2 |
- $ADDU $c_3,$at |
- mflo $t_1 |
- mfhi $t_2 |
- $ADDU $c_1,$t_1 |
- sltu $at,$c_1,$t_1 |
- $MULTU $a_0,$b_7 # mul_add_c(a[0],b[7],c2,c3,c1); |
- $ADDU $t_2,$at |
- $ADDU $c_2,$t_2 |
- sltu $at,$c_2,$t_2 |
- $ADDU $c_3,$at |
- $ST $c_1,6*$BNSZ($a0) # r[6]=c1; |
- |
- mflo $t_1 |
- mfhi $t_2 |
- $ADDU $c_2,$t_1 |
- sltu $at,$c_2,$t_1 |
- $MULTU $a_1,$b_6 # mul_add_c(a[1],b[6],c2,c3,c1); |
- $ADDU $t_2,$at |
- $ADDU $c_3,$t_2 |
- sltu $c_1,$c_3,$t_2 |
- mflo $t_1 |
- mfhi $t_2 |
- $ADDU $c_2,$t_1 |
- sltu $at,$c_2,$t_1 |
- $MULTU $a_2,$b_5 # mul_add_c(a[2],b[5],c2,c3,c1); |
- $ADDU $t_2,$at |
- $ADDU $c_3,$t_2 |
- sltu $at,$c_3,$t_2 |
- $ADDU $c_1,$at |
- mflo $t_1 |
- mfhi $t_2 |
- $ADDU $c_2,$t_1 |
- sltu $at,$c_2,$t_1 |
- $MULTU $a_3,$b_4 # mul_add_c(a[3],b[4],c2,c3,c1); |
- $ADDU $t_2,$at |
- $ADDU $c_3,$t_2 |
- sltu $at,$c_3,$t_2 |
- $ADDU $c_1,$at |
- mflo $t_1 |
- mfhi $t_2 |
- $ADDU $c_2,$t_1 |
- sltu $at,$c_2,$t_1 |
- $MULTU $a_4,$b_3 # mul_add_c(a[4],b[3],c2,c3,c1); |
- $ADDU $t_2,$at |
- $ADDU $c_3,$t_2 |
- sltu $at,$c_3,$t_2 |
- $ADDU $c_1,$at |
- mflo $t_1 |
- mfhi $t_2 |
- $ADDU $c_2,$t_1 |
- sltu $at,$c_2,$t_1 |
- $MULTU $a_5,$b_2 # mul_add_c(a[5],b[2],c2,c3,c1); |
- $ADDU $t_2,$at |
- $ADDU $c_3,$t_2 |
- sltu $at,$c_3,$t_2 |
- $ADDU $c_1,$at |
- mflo $t_1 |
- mfhi $t_2 |
- $ADDU $c_2,$t_1 |
- sltu $at,$c_2,$t_1 |
- $MULTU $a_6,$b_1 # mul_add_c(a[6],b[1],c2,c3,c1); |
- $ADDU $t_2,$at |
- $ADDU $c_3,$t_2 |
- sltu $at,$c_3,$t_2 |
- $ADDU $c_1,$at |
- mflo $t_1 |
- mfhi $t_2 |
- $ADDU $c_2,$t_1 |
- sltu $at,$c_2,$t_1 |
- $MULTU $a_7,$b_0 # mul_add_c(a[7],b[0],c2,c3,c1); |
- $ADDU $t_2,$at |
- $ADDU $c_3,$t_2 |
- sltu $at,$c_3,$t_2 |
- $ADDU $c_1,$at |
- mflo $t_1 |
- mfhi $t_2 |
- $ADDU $c_2,$t_1 |
- sltu $at,$c_2,$t_1 |
- $MULTU $a_7,$b_1 # mul_add_c(a[7],b[1],c3,c1,c2); |
- $ADDU $t_2,$at |
- $ADDU $c_3,$t_2 |
- sltu $at,$c_3,$t_2 |
- $ADDU $c_1,$at |
- $ST $c_2,7*$BNSZ($a0) # r[7]=c2; |
- |
- mflo $t_1 |
- mfhi $t_2 |
- $ADDU $c_3,$t_1 |
- sltu $at,$c_3,$t_1 |
- $MULTU $a_6,$b_2 # mul_add_c(a[6],b[2],c3,c1,c2); |
- $ADDU $t_2,$at |
- $ADDU $c_1,$t_2 |
- sltu $c_2,$c_1,$t_2 |
- mflo $t_1 |
- mfhi $t_2 |
- $ADDU $c_3,$t_1 |
- sltu $at,$c_3,$t_1 |
- $MULTU $a_5,$b_3 # mul_add_c(a[5],b[3],c3,c1,c2); |
- $ADDU $t_2,$at |
- $ADDU $c_1,$t_2 |
- sltu $at,$c_1,$t_2 |
- $ADDU $c_2,$at |
- mflo $t_1 |
- mfhi $t_2 |
- $ADDU $c_3,$t_1 |
- sltu $at,$c_3,$t_1 |
- $MULTU $a_4,$b_4 # mul_add_c(a[4],b[4],c3,c1,c2); |
- $ADDU $t_2,$at |
- $ADDU $c_1,$t_2 |
- sltu $at,$c_1,$t_2 |
- $ADDU $c_2,$at |
- mflo $t_1 |
- mfhi $t_2 |
- $ADDU $c_3,$t_1 |
- sltu $at,$c_3,$t_1 |
- $MULTU $a_3,$b_5 # mul_add_c(a[3],b[5],c3,c1,c2); |
- $ADDU $t_2,$at |
- $ADDU $c_1,$t_2 |
- sltu $at,$c_1,$t_2 |
- $ADDU $c_2,$at |
- mflo $t_1 |
- mfhi $t_2 |
- $ADDU $c_3,$t_1 |
- sltu $at,$c_3,$t_1 |
- $MULTU $a_2,$b_6 # mul_add_c(a[2],b[6],c3,c1,c2); |
- $ADDU $t_2,$at |
- $ADDU $c_1,$t_2 |
- sltu $at,$c_1,$t_2 |
- $ADDU $c_2,$at |
- mflo $t_1 |
- mfhi $t_2 |
- $ADDU $c_3,$t_1 |
- sltu $at,$c_3,$t_1 |
- $MULTU $a_1,$b_7 # mul_add_c(a[1],b[7],c3,c1,c2); |
- $ADDU $t_2,$at |
- $ADDU $c_1,$t_2 |
- sltu $at,$c_1,$t_2 |
- $ADDU $c_2,$at |
- mflo $t_1 |
- mfhi $t_2 |
- $ADDU $c_3,$t_1 |
- sltu $at,$c_3,$t_1 |
- $MULTU $a_2,$b_7 # mul_add_c(a[2],b[7],c1,c2,c3); |
- $ADDU $t_2,$at |
- $ADDU $c_1,$t_2 |
- sltu $at,$c_1,$t_2 |
- $ADDU $c_2,$at |
- $ST $c_3,8*$BNSZ($a0) # r[8]=c3; |
- |
- mflo $t_1 |
- mfhi $t_2 |
- $ADDU $c_1,$t_1 |
- sltu $at,$c_1,$t_1 |
- $MULTU $a_3,$b_6 # mul_add_c(a[3],b[6],c1,c2,c3); |
- $ADDU $t_2,$at |
- $ADDU $c_2,$t_2 |
- sltu $c_3,$c_2,$t_2 |
- mflo $t_1 |
- mfhi $t_2 |
- $ADDU $c_1,$t_1 |
- sltu $at,$c_1,$t_1 |
- $MULTU $a_4,$b_5 # mul_add_c(a[4],b[5],c1,c2,c3); |
- $ADDU $t_2,$at |
- $ADDU $c_2,$t_2 |
- sltu $at,$c_2,$t_2 |
- $ADDU $c_3,$at |
- mflo $t_1 |
- mfhi $t_2 |
- $ADDU $c_1,$t_1 |
- sltu $at,$c_1,$t_1 |
- $MULTU $a_5,$b_4 # mul_add_c(a[5],b[4],c1,c2,c3); |
- $ADDU $t_2,$at |
- $ADDU $c_2,$t_2 |
- sltu $at,$c_2,$t_2 |
- $ADDU $c_3,$at |
- mflo $t_1 |
- mfhi $t_2 |
- $ADDU $c_1,$t_1 |
- sltu $at,$c_1,$t_1 |
- $MULTU $a_6,$b_3 # mul_add_c(a[6],b[3],c1,c2,c3); |
- $ADDU $t_2,$at |
- $ADDU $c_2,$t_2 |
- sltu $at,$c_2,$t_2 |
- $ADDU $c_3,$at |
- mflo $t_1 |
- mfhi $t_2 |
- $ADDU $c_1,$t_1 |
- sltu $at,$c_1,$t_1 |
- $MULTU $a_7,$b_2 # mul_add_c(a[7],b[2],c1,c2,c3); |
- $ADDU $t_2,$at |
- $ADDU $c_2,$t_2 |
- sltu $at,$c_2,$t_2 |
- $ADDU $c_3,$at |
- mflo $t_1 |
- mfhi $t_2 |
- $ADDU $c_1,$t_1 |
- sltu $at,$c_1,$t_1 |
- $MULTU $a_7,$b_3 # mul_add_c(a[7],b[3],c2,c3,c1); |
- $ADDU $t_2,$at |
- $ADDU $c_2,$t_2 |
- sltu $at,$c_2,$t_2 |
- $ADDU $c_3,$at |
- $ST $c_1,9*$BNSZ($a0) # r[9]=c1; |
- |
- mflo $t_1 |
- mfhi $t_2 |
- $ADDU $c_2,$t_1 |
- sltu $at,$c_2,$t_1 |
- $MULTU $a_6,$b_4 # mul_add_c(a[6],b[4],c2,c3,c1); |
- $ADDU $t_2,$at |
- $ADDU $c_3,$t_2 |
- sltu $c_1,$c_3,$t_2 |
- mflo $t_1 |
- mfhi $t_2 |
- $ADDU $c_2,$t_1 |
- sltu $at,$c_2,$t_1 |
- $MULTU $a_5,$b_5 # mul_add_c(a[5],b[5],c2,c3,c1); |
- $ADDU $t_2,$at |
- $ADDU $c_3,$t_2 |
- sltu $at,$c_3,$t_2 |
- $ADDU $c_1,$at |
- mflo $t_1 |
- mfhi $t_2 |
- $ADDU $c_2,$t_1 |
- sltu $at,$c_2,$t_1 |
- $MULTU $a_4,$b_6 # mul_add_c(a[4],b[6],c2,c3,c1); |
- $ADDU $t_2,$at |
- $ADDU $c_3,$t_2 |
- sltu $at,$c_3,$t_2 |
- $ADDU $c_1,$at |
- mflo $t_1 |
- mfhi $t_2 |
- $ADDU $c_2,$t_1 |
- sltu $at,$c_2,$t_1 |
- $MULTU $a_3,$b_7 # mul_add_c(a[3],b[7],c2,c3,c1); |
- $ADDU $t_2,$at |
- $ADDU $c_3,$t_2 |
- sltu $at,$c_3,$t_2 |
- $ADDU $c_1,$at |
- mflo $t_1 |
- mfhi $t_2 |
- $ADDU $c_2,$t_1 |
- sltu $at,$c_2,$t_1 |
- $MULTU $a_4,$b_7 # mul_add_c(a[4],b[7],c3,c1,c2); |
- $ADDU $t_2,$at |
- $ADDU $c_3,$t_2 |
- sltu $at,$c_3,$t_2 |
- $ADDU $c_1,$at |
- $ST $c_2,10*$BNSZ($a0) # r[10]=c2; |
- |
- mflo $t_1 |
- mfhi $t_2 |
- $ADDU $c_3,$t_1 |
- sltu $at,$c_3,$t_1 |
- $MULTU $a_5,$b_6 # mul_add_c(a[5],b[6],c3,c1,c2); |
- $ADDU $t_2,$at |
- $ADDU $c_1,$t_2 |
- sltu $c_2,$c_1,$t_2 |
- mflo $t_1 |
- mfhi $t_2 |
- $ADDU $c_3,$t_1 |
- sltu $at,$c_3,$t_1 |
- $MULTU $a_6,$b_5 # mul_add_c(a[6],b[5],c3,c1,c2); |
- $ADDU $t_2,$at |
- $ADDU $c_1,$t_2 |
- sltu $at,$c_1,$t_2 |
- $ADDU $c_2,$at |
- mflo $t_1 |
- mfhi $t_2 |
- $ADDU $c_3,$t_1 |
- sltu $at,$c_3,$t_1 |
- $MULTU $a_7,$b_4 # mul_add_c(a[7],b[4],c3,c1,c2); |
- $ADDU $t_2,$at |
- $ADDU $c_1,$t_2 |
- sltu $at,$c_1,$t_2 |
- $ADDU $c_2,$at |
- mflo $t_1 |
- mfhi $t_2 |
- $ADDU $c_3,$t_1 |
- sltu $at,$c_3,$t_1 |
- $MULTU $a_7,$b_5 # mul_add_c(a[7],b[5],c1,c2,c3); |
- $ADDU $t_2,$at |
- $ADDU $c_1,$t_2 |
- sltu $at,$c_1,$t_2 |
- $ADDU $c_2,$at |
- $ST $c_3,11*$BNSZ($a0) # r[11]=c3; |
- |
- mflo $t_1 |
- mfhi $t_2 |
- $ADDU $c_1,$t_1 |
- sltu $at,$c_1,$t_1 |
- $MULTU $a_6,$b_6 # mul_add_c(a[6],b[6],c1,c2,c3); |
- $ADDU $t_2,$at |
- $ADDU $c_2,$t_2 |
- sltu $c_3,$c_2,$t_2 |
- mflo $t_1 |
- mfhi $t_2 |
- $ADDU $c_1,$t_1 |
- sltu $at,$c_1,$t_1 |
- $MULTU $a_5,$b_7 # mul_add_c(a[5],b[7],c1,c2,c3); |
- $ADDU $t_2,$at |
- $ADDU $c_2,$t_2 |
- sltu $at,$c_2,$t_2 |
- $ADDU $c_3,$at |
- mflo $t_1 |
- mfhi $t_2 |
- $ADDU $c_1,$t_1 |
- sltu $at,$c_1,$t_1 |
- $MULTU $a_6,$b_7 # mul_add_c(a[6],b[7],c2,c3,c1); |
- $ADDU $t_2,$at |
- $ADDU $c_2,$t_2 |
- sltu $at,$c_2,$t_2 |
- $ADDU $c_3,$at |
- $ST $c_1,12*$BNSZ($a0) # r[12]=c1; |
- |
- mflo $t_1 |
- mfhi $t_2 |
- $ADDU $c_2,$t_1 |
- sltu $at,$c_2,$t_1 |
- $MULTU $a_7,$b_6 # mul_add_c(a[7],b[6],c2,c3,c1); |
- $ADDU $t_2,$at |
- $ADDU $c_3,$t_2 |
- sltu $c_1,$c_3,$t_2 |
- mflo $t_1 |
- mfhi $t_2 |
- $ADDU $c_2,$t_1 |
- sltu $at,$c_2,$t_1 |
- $MULTU $a_7,$b_7 # mul_add_c(a[7],b[7],c3,c1,c2); |
- $ADDU $t_2,$at |
- $ADDU $c_3,$t_2 |
- sltu $at,$c_3,$t_2 |
- $ADDU $c_1,$at |
- $ST $c_2,13*$BNSZ($a0) # r[13]=c2; |
- |
- mflo $t_1 |
- mfhi $t_2 |
- $ADDU $c_3,$t_1 |
- sltu $at,$c_3,$t_1 |
- $ADDU $t_2,$at |
- $ADDU $c_1,$t_2 |
- $ST $c_3,14*$BNSZ($a0) # r[14]=c3; |
- $ST $c_1,15*$BNSZ($a0) # r[15]=c1; |
- |
- .set noreorder |
-___ |
-$code.=<<___ if ($flavour =~ /nubi/i); |
- $REG_L $s5,10*$SZREG($sp) |
- $REG_L $s4,9*$SZREG($sp) |
- $REG_L $s3,8*$SZREG($sp) |
- $REG_L $s2,7*$SZREG($sp) |
- $REG_L $s1,6*$SZREG($sp) |
- $REG_L $s0,5*$SZREG($sp) |
- $REG_L $t3,4*$SZREG($sp) |
- $REG_L $t2,3*$SZREG($sp) |
- $REG_L $t1,2*$SZREG($sp) |
- $REG_L $t0,1*$SZREG($sp) |
- $REG_L $gp,0*$SZREG($sp) |
- jr $ra |
- $PTR_ADD $sp,12*$SZREG |
-___ |
-$code.=<<___ if ($flavour !~ /nubi/i); |
- $REG_L $s5,5*$SZREG($sp) |
- $REG_L $s4,4*$SZREG($sp) |
- $REG_L $s3,3*$SZREG($sp) |
- $REG_L $s2,2*$SZREG($sp) |
- $REG_L $s1,1*$SZREG($sp) |
- $REG_L $s0,0*$SZREG($sp) |
- jr $ra |
- $PTR_ADD $sp,6*$SZREG |
-___ |
-$code.=<<___; |
-.end bn_mul_comba8 |
- |
-.align 5 |
-.globl bn_mul_comba4 |
-.ent bn_mul_comba4 |
-bn_mul_comba4: |
-___ |
-$code.=<<___ if ($flavour =~ /nubi/i); |
- .frame $sp,6*$SZREG,$ra |
- .mask 0x8000f008,-$SZREG |
- .set noreorder |
- $PTR_SUB $sp,6*$SZREG |
- $REG_S $ra,5*$SZREG($sp) |
- $REG_S $t3,4*$SZREG($sp) |
- $REG_S $t2,3*$SZREG($sp) |
- $REG_S $t1,2*$SZREG($sp) |
- $REG_S $t0,1*$SZREG($sp) |
- $REG_S $gp,0*$SZREG($sp) |
-___ |
-$code.=<<___; |
- .set reorder |
- $LD $a_0,0($a1) |
- $LD $b_0,0($a2) |
- $LD $a_1,$BNSZ($a1) |
- $LD $a_2,2*$BNSZ($a1) |
- $MULTU $a_0,$b_0 # mul_add_c(a[0],b[0],c1,c2,c3); |
- $LD $a_3,3*$BNSZ($a1) |
- $LD $b_1,$BNSZ($a2) |
- $LD $b_2,2*$BNSZ($a2) |
- $LD $b_3,3*$BNSZ($a2) |
- mflo $c_1 |
- mfhi $c_2 |
- $ST $c_1,0($a0) |
- |
- $MULTU $a_0,$b_1 # mul_add_c(a[0],b[1],c2,c3,c1); |
- mflo $t_1 |
- mfhi $t_2 |
- $ADDU $c_2,$t_1 |
- sltu $at,$c_2,$t_1 |
- $MULTU $a_1,$b_0 # mul_add_c(a[1],b[0],c2,c3,c1); |
- $ADDU $c_3,$t_2,$at |
- mflo $t_1 |
- mfhi $t_2 |
- $ADDU $c_2,$t_1 |
- sltu $at,$c_2,$t_1 |
- $MULTU $a_2,$b_0 # mul_add_c(a[2],b[0],c3,c1,c2); |
- $ADDU $t_2,$at |
- $ADDU $c_3,$t_2 |
- sltu $c_1,$c_3,$t_2 |
- $ST $c_2,$BNSZ($a0) |
- |
- mflo $t_1 |
- mfhi $t_2 |
- $ADDU $c_3,$t_1 |
- sltu $at,$c_3,$t_1 |
- $MULTU $a_1,$b_1 # mul_add_c(a[1],b[1],c3,c1,c2); |
- $ADDU $t_2,$at |
- $ADDU $c_1,$t_2 |
- mflo $t_1 |
- mfhi $t_2 |
- $ADDU $c_3,$t_1 |
- sltu $at,$c_3,$t_1 |
- $MULTU $a_0,$b_2 # mul_add_c(a[0],b[2],c3,c1,c2); |
- $ADDU $t_2,$at |
- $ADDU $c_1,$t_2 |
- sltu $c_2,$c_1,$t_2 |
- mflo $t_1 |
- mfhi $t_2 |
- $ADDU $c_3,$t_1 |
- sltu $at,$c_3,$t_1 |
- $MULTU $a_0,$b_3 # mul_add_c(a[0],b[3],c1,c2,c3); |
- $ADDU $t_2,$at |
- $ADDU $c_1,$t_2 |
- sltu $at,$c_1,$t_2 |
- $ADDU $c_2,$at |
- $ST $c_3,2*$BNSZ($a0) |
- |
- mflo $t_1 |
- mfhi $t_2 |
- $ADDU $c_1,$t_1 |
- sltu $at,$c_1,$t_1 |
- $MULTU $a_1,$b_2 # mul_add_c(a[1],b[2],c1,c2,c3); |
- $ADDU $t_2,$at |
- $ADDU $c_2,$t_2 |
- sltu $c_3,$c_2,$t_2 |
- mflo $t_1 |
- mfhi $t_2 |
- $ADDU $c_1,$t_1 |
- sltu $at,$c_1,$t_1 |
- $MULTU $a_2,$b_1 # mul_add_c(a[2],b[1],c1,c2,c3); |
- $ADDU $t_2,$at |
- $ADDU $c_2,$t_2 |
- sltu $at,$c_2,$t_2 |
- $ADDU $c_3,$at |
- mflo $t_1 |
- mfhi $t_2 |
- $ADDU $c_1,$t_1 |
- sltu $at,$c_1,$t_1 |
- $MULTU $a_3,$b_0 # mul_add_c(a[3],b[0],c1,c2,c3); |
- $ADDU $t_2,$at |
- $ADDU $c_2,$t_2 |
- sltu $at,$c_2,$t_2 |
- $ADDU $c_3,$at |
- mflo $t_1 |
- mfhi $t_2 |
- $ADDU $c_1,$t_1 |
- sltu $at,$c_1,$t_1 |
- $MULTU $a_3,$b_1 # mul_add_c(a[3],b[1],c2,c3,c1); |
- $ADDU $t_2,$at |
- $ADDU $c_2,$t_2 |
- sltu $at,$c_2,$t_2 |
- $ADDU $c_3,$at |
- $ST $c_1,3*$BNSZ($a0) |
- |
- mflo $t_1 |
- mfhi $t_2 |
- $ADDU $c_2,$t_1 |
- sltu $at,$c_2,$t_1 |
- $MULTU $a_2,$b_2 # mul_add_c(a[2],b[2],c2,c3,c1); |
- $ADDU $t_2,$at |
- $ADDU $c_3,$t_2 |
- sltu $c_1,$c_3,$t_2 |
- mflo $t_1 |
- mfhi $t_2 |
- $ADDU $c_2,$t_1 |
- sltu $at,$c_2,$t_1 |
- $MULTU $a_1,$b_3 # mul_add_c(a[1],b[3],c2,c3,c1); |
- $ADDU $t_2,$at |
- $ADDU $c_3,$t_2 |
- sltu $at,$c_3,$t_2 |
- $ADDU $c_1,$at |
- mflo $t_1 |
- mfhi $t_2 |
- $ADDU $c_2,$t_1 |
- sltu $at,$c_2,$t_1 |
- $MULTU $a_2,$b_3 # mul_add_c(a[2],b[3],c3,c1,c2); |
- $ADDU $t_2,$at |
- $ADDU $c_3,$t_2 |
- sltu $at,$c_3,$t_2 |
- $ADDU $c_1,$at |
- $ST $c_2,4*$BNSZ($a0) |
- |
- mflo $t_1 |
- mfhi $t_2 |
- $ADDU $c_3,$t_1 |
- sltu $at,$c_3,$t_1 |
- $MULTU $a_3,$b_2 # mul_add_c(a[3],b[2],c3,c1,c2); |
- $ADDU $t_2,$at |
- $ADDU $c_1,$t_2 |
- sltu $c_2,$c_1,$t_2 |
- mflo $t_1 |
- mfhi $t_2 |
- $ADDU $c_3,$t_1 |
- sltu $at,$c_3,$t_1 |
- $MULTU $a_3,$b_3 # mul_add_c(a[3],b[3],c1,c2,c3); |
- $ADDU $t_2,$at |
- $ADDU $c_1,$t_2 |
- sltu $at,$c_1,$t_2 |
- $ADDU $c_2,$at |
- $ST $c_3,5*$BNSZ($a0) |
- |
- mflo $t_1 |
- mfhi $t_2 |
- $ADDU $c_1,$t_1 |
- sltu $at,$c_1,$t_1 |
- $ADDU $t_2,$at |
- $ADDU $c_2,$t_2 |
- $ST $c_1,6*$BNSZ($a0) |
- $ST $c_2,7*$BNSZ($a0) |
- |
- .set noreorder |
-___ |
-$code.=<<___ if ($flavour =~ /nubi/i); |
- $REG_L $t3,4*$SZREG($sp) |
- $REG_L $t2,3*$SZREG($sp) |
- $REG_L $t1,2*$SZREG($sp) |
- $REG_L $t0,1*$SZREG($sp) |
- $REG_L $gp,0*$SZREG($sp) |
- $PTR_ADD $sp,6*$SZREG |
-___ |
-$code.=<<___; |
- jr $ra |
- nop |
-.end bn_mul_comba4 |
-___ |
- |
-($a_4,$a_5,$a_6,$a_7)=($b_0,$b_1,$b_2,$b_3); |
- |
-$code.=<<___; |
- |
-.align 5 |
-.globl bn_sqr_comba8 |
-.ent bn_sqr_comba8 |
-bn_sqr_comba8: |
-___ |
-$code.=<<___ if ($flavour =~ /nubi/i); |
- .frame $sp,6*$SZREG,$ra |
- .mask 0x8000f008,-$SZREG |
- .set noreorder |
- $PTR_SUB $sp,6*$SZREG |
- $REG_S $ra,5*$SZREG($sp) |
- $REG_S $t3,4*$SZREG($sp) |
- $REG_S $t2,3*$SZREG($sp) |
- $REG_S $t1,2*$SZREG($sp) |
- $REG_S $t0,1*$SZREG($sp) |
- $REG_S $gp,0*$SZREG($sp) |
-___ |
-$code.=<<___; |
- .set reorder |
- $LD $a_0,0($a1) |
- $LD $a_1,$BNSZ($a1) |
- $LD $a_2,2*$BNSZ($a1) |
- $LD $a_3,3*$BNSZ($a1) |
- |
- $MULTU $a_0,$a_0 # mul_add_c(a[0],b[0],c1,c2,c3); |
- $LD $a_4,4*$BNSZ($a1) |
- $LD $a_5,5*$BNSZ($a1) |
- $LD $a_6,6*$BNSZ($a1) |
- $LD $a_7,7*$BNSZ($a1) |
- mflo $c_1 |
- mfhi $c_2 |
- $ST $c_1,0($a0) |
- |
- $MULTU $a_0,$a_1 # mul_add_c2(a[0],b[1],c2,c3,c1); |
- mflo $t_1 |
- mfhi $t_2 |
- slt $c_1,$t_2,$zero |
- $SLL $t_2,1 |
- $MULTU $a_2,$a_0 # mul_add_c2(a[2],b[0],c3,c1,c2); |
- slt $a2,$t_1,$zero |
- $ADDU $t_2,$a2 |
- $SLL $t_1,1 |
- $ADDU $c_2,$t_1 |
- sltu $at,$c_2,$t_1 |
- $ADDU $c_3,$t_2,$at |
- $ST $c_2,$BNSZ($a0) |
- |
- mflo $t_1 |
- mfhi $t_2 |
- slt $c_2,$t_2,$zero |
- $SLL $t_2,1 |
- $MULTU $a_1,$a_1 # mul_add_c(a[1],b[1],c3,c1,c2); |
- slt $a2,$t_1,$zero |
- $ADDU $t_2,$a2 |
- $SLL $t_1,1 |
- $ADDU $c_3,$t_1 |
- sltu $at,$c_3,$t_1 |
- $ADDU $t_2,$at |
- $ADDU $c_1,$t_2 |
- sltu $at,$c_1,$t_2 |
- $ADDU $c_2,$at |
- mflo $t_1 |
- mfhi $t_2 |
- $ADDU $c_3,$t_1 |
- sltu $at,$c_3,$t_1 |
- $MULTU $a_0,$a_3 # mul_add_c2(a[0],b[3],c1,c2,c3); |
- $ADDU $t_2,$at |
- $ADDU $c_1,$t_2 |
- sltu $at,$c_1,$t_2 |
- $ADDU $c_2,$at |
- $ST $c_3,2*$BNSZ($a0) |
- |
- mflo $t_1 |
- mfhi $t_2 |
- slt $c_3,$t_2,$zero |
- $SLL $t_2,1 |
- $MULTU $a_1,$a_2 # mul_add_c2(a[1],b[2],c1,c2,c3); |
- slt $a2,$t_1,$zero |
- $ADDU $t_2,$a2 |
- $SLL $t_1,1 |
- $ADDU $c_1,$t_1 |
- sltu $at,$c_1,$t_1 |
- $ADDU $t_2,$at |
- $ADDU $c_2,$t_2 |
- sltu $at,$c_2,$t_2 |
- $ADDU $c_3,$at |
- mflo $t_1 |
- mfhi $t_2 |
- slt $at,$t_2,$zero |
- $ADDU $c_3,$at |
- $MULTU $a_4,$a_0 # mul_add_c2(a[4],b[0],c2,c3,c1); |
- $SLL $t_2,1 |
- slt $a2,$t_1,$zero |
- $ADDU $t_2,$a2 |
- $SLL $t_1,1 |
- $ADDU $c_1,$t_1 |
- sltu $at,$c_1,$t_1 |
- $ADDU $t_2,$at |
- $ADDU $c_2,$t_2 |
- sltu $at,$c_2,$t_2 |
- $ADDU $c_3,$at |
- $ST $c_1,3*$BNSZ($a0) |
- |
- mflo $t_1 |
- mfhi $t_2 |
- slt $c_1,$t_2,$zero |
- $SLL $t_2,1 |
- $MULTU $a_3,$a_1 # mul_add_c2(a[3],b[1],c2,c3,c1); |
- slt $a2,$t_1,$zero |
- $ADDU $t_2,$a2 |
- $SLL $t_1,1 |
- $ADDU $c_2,$t_1 |
- sltu $at,$c_2,$t_1 |
- $ADDU $t_2,$at |
- $ADDU $c_3,$t_2 |
- sltu $at,$c_3,$t_2 |
- $ADDU $c_1,$at |
- mflo $t_1 |
- mfhi $t_2 |
- slt $at,$t_2,$zero |
- $ADDU $c_1,$at |
- $MULTU $a_2,$a_2 # mul_add_c(a[2],b[2],c2,c3,c1); |
- $SLL $t_2,1 |
- slt $a2,$t_1,$zero |
- $ADDU $t_2,$a2 |
- $SLL $t_1,1 |
- $ADDU $c_2,$t_1 |
- sltu $at,$c_2,$t_1 |
- $ADDU $t_2,$at |
- $ADDU $c_3,$t_2 |
- sltu $at,$c_3,$t_2 |
- $ADDU $c_1,$at |
- mflo $t_1 |
- mfhi $t_2 |
- $ADDU $c_2,$t_1 |
- sltu $at,$c_2,$t_1 |
- $MULTU $a_0,$a_5 # mul_add_c2(a[0],b[5],c3,c1,c2); |
- $ADDU $t_2,$at |
- $ADDU $c_3,$t_2 |
- sltu $at,$c_3,$t_2 |
- $ADDU $c_1,$at |
- $ST $c_2,4*$BNSZ($a0) |
- |
- mflo $t_1 |
- mfhi $t_2 |
- slt $c_2,$t_2,$zero |
- $SLL $t_2,1 |
- $MULTU $a_1,$a_4 # mul_add_c2(a[1],b[4],c3,c1,c2); |
- slt $a2,$t_1,$zero |
- $ADDU $t_2,$a2 |
- $SLL $t_1,1 |
- $ADDU $c_3,$t_1 |
- sltu $at,$c_3,$t_1 |
- $ADDU $t_2,$at |
- $ADDU $c_1,$t_2 |
- sltu $at,$c_1,$t_2 |
- $ADDU $c_2,$at |
- mflo $t_1 |
- mfhi $t_2 |
- slt $at,$t_2,$zero |
- $ADDU $c_2,$at |
- $MULTU $a_2,$a_3 # mul_add_c2(a[2],b[3],c3,c1,c2); |
- $SLL $t_2,1 |
- slt $a2,$t_1,$zero |
- $ADDU $t_2,$a2 |
- $SLL $t_1,1 |
- $ADDU $c_3,$t_1 |
- sltu $at,$c_3,$t_1 |
- $ADDU $t_2,$at |
- $ADDU $c_1,$t_2 |
- sltu $at,$c_1,$t_2 |
- $ADDU $c_2,$at |
- mflo $t_1 |
- mfhi $t_2 |
- slt $at,$t_2,$zero |
- $MULTU $a_6,$a_0 # mul_add_c2(a[6],b[0],c1,c2,c3); |
- $ADDU $c_2,$at |
- $SLL $t_2,1 |
- slt $a2,$t_1,$zero |
- $ADDU $t_2,$a2 |
- $SLL $t_1,1 |
- $ADDU $c_3,$t_1 |
- sltu $at,$c_3,$t_1 |
- $ADDU $t_2,$at |
- $ADDU $c_1,$t_2 |
- sltu $at,$c_1,$t_2 |
- $ADDU $c_2,$at |
- $ST $c_3,5*$BNSZ($a0) |
- |
- mflo $t_1 |
- mfhi $t_2 |
- slt $c_3,$t_2,$zero |
- $SLL $t_2,1 |
- $MULTU $a_5,$a_1 # mul_add_c2(a[5],b[1],c1,c2,c3); |
- slt $a2,$t_1,$zero |
- $ADDU $t_2,$a2 |
- $SLL $t_1,1 |
- $ADDU $c_1,$t_1 |
- sltu $at,$c_1,$t_1 |
- $ADDU $t_2,$at |
- $ADDU $c_2,$t_2 |
- sltu $at,$c_2,$t_2 |
- $ADDU $c_3,$at |
- mflo $t_1 |
- mfhi $t_2 |
- slt $at,$t_2,$zero |
- $ADDU $c_3,$at |
- $MULTU $a_4,$a_2 # mul_add_c2(a[4],b[2],c1,c2,c3); |
- $SLL $t_2,1 |
- slt $a2,$t_1,$zero |
- $ADDU $t_2,$a2 |
- $SLL $t_1,1 |
- $ADDU $c_1,$t_1 |
- sltu $at,$c_1,$t_1 |
- $ADDU $t_2,$at |
- $ADDU $c_2,$t_2 |
- sltu $at,$c_2,$t_2 |
- $ADDU $c_3,$at |
- mflo $t_1 |
- mfhi $t_2 |
- slt $at,$t_2,$zero |
- $ADDU $c_3,$at |
- $MULTU $a_3,$a_3 # mul_add_c(a[3],b[3],c1,c2,c3); |
- $SLL $t_2,1 |
- slt $a2,$t_1,$zero |
- $ADDU $t_2,$a2 |
- $SLL $t_1,1 |
- $ADDU $c_1,$t_1 |
- sltu $at,$c_1,$t_1 |
- $ADDU $t_2,$at |
- $ADDU $c_2,$t_2 |
- sltu $at,$c_2,$t_2 |
- $ADDU $c_3,$at |
- mflo $t_1 |
- mfhi $t_2 |
- $ADDU $c_1,$t_1 |
- sltu $at,$c_1,$t_1 |
- $MULTU $a_0,$a_7 # mul_add_c2(a[0],b[7],c2,c3,c1); |
- $ADDU $t_2,$at |
- $ADDU $c_2,$t_2 |
- sltu $at,$c_2,$t_2 |
- $ADDU $c_3,$at |
- $ST $c_1,6*$BNSZ($a0) |
- |
- mflo $t_1 |
- mfhi $t_2 |
- slt $c_1,$t_2,$zero |
- $SLL $t_2,1 |
- $MULTU $a_1,$a_6 # mul_add_c2(a[1],b[6],c2,c3,c1); |
- slt $a2,$t_1,$zero |
- $ADDU $t_2,$a2 |
- $SLL $t_1,1 |
- $ADDU $c_2,$t_1 |
- sltu $at,$c_2,$t_1 |
- $ADDU $t_2,$at |
- $ADDU $c_3,$t_2 |
- sltu $at,$c_3,$t_2 |
- $ADDU $c_1,$at |
- mflo $t_1 |
- mfhi $t_2 |
- slt $at,$t_2,$zero |
- $ADDU $c_1,$at |
- $MULTU $a_2,$a_5 # mul_add_c2(a[2],b[5],c2,c3,c1); |
- $SLL $t_2,1 |
- slt $a2,$t_1,$zero |
- $ADDU $t_2,$a2 |
- $SLL $t_1,1 |
- $ADDU $c_2,$t_1 |
- sltu $at,$c_2,$t_1 |
- $ADDU $t_2,$at |
- $ADDU $c_3,$t_2 |
- sltu $at,$c_3,$t_2 |
- $ADDU $c_1,$at |
- mflo $t_1 |
- mfhi $t_2 |
- slt $at,$t_2,$zero |
- $ADDU $c_1,$at |
- $MULTU $a_3,$a_4 # mul_add_c2(a[3],b[4],c2,c3,c1); |
- $SLL $t_2,1 |
- slt $a2,$t_1,$zero |
- $ADDU $t_2,$a2 |
- $SLL $t_1,1 |
- $ADDU $c_2,$t_1 |
- sltu $at,$c_2,$t_1 |
- $ADDU $t_2,$at |
- $ADDU $c_3,$t_2 |
- sltu $at,$c_3,$t_2 |
- $ADDU $c_1,$at |
- mflo $t_1 |
- mfhi $t_2 |
- slt $at,$t_2,$zero |
- $ADDU $c_1,$at |
- $MULTU $a_7,$a_1 # mul_add_c2(a[7],b[1],c3,c1,c2); |
- $SLL $t_2,1 |
- slt $a2,$t_1,$zero |
- $ADDU $t_2,$a2 |
- $SLL $t_1,1 |
- $ADDU $c_2,$t_1 |
- sltu $at,$c_2,$t_1 |
- $ADDU $t_2,$at |
- $ADDU $c_3,$t_2 |
- sltu $at,$c_3,$t_2 |
- $ADDU $c_1,$at |
- $ST $c_2,7*$BNSZ($a0) |
- |
- mflo $t_1 |
- mfhi $t_2 |
- slt $c_2,$t_2,$zero |
- $SLL $t_2,1 |
- $MULTU $a_6,$a_2 # mul_add_c2(a[6],b[2],c3,c1,c2); |
- slt $a2,$t_1,$zero |
- $ADDU $t_2,$a2 |
- $SLL $t_1,1 |
- $ADDU $c_3,$t_1 |
- sltu $at,$c_3,$t_1 |
- $ADDU $t_2,$at |
- $ADDU $c_1,$t_2 |
- sltu $at,$c_1,$t_2 |
- $ADDU $c_2,$at |
- mflo $t_1 |
- mfhi $t_2 |
- slt $at,$t_2,$zero |
- $ADDU $c_2,$at |
- $MULTU $a_5,$a_3 # mul_add_c2(a[5],b[3],c3,c1,c2); |
- $SLL $t_2,1 |
- slt $a2,$t_1,$zero |
- $ADDU $t_2,$a2 |
- $SLL $t_1,1 |
- $ADDU $c_3,$t_1 |
- sltu $at,$c_3,$t_1 |
- $ADDU $t_2,$at |
- $ADDU $c_1,$t_2 |
- sltu $at,$c_1,$t_2 |
- $ADDU $c_2,$at |
- mflo $t_1 |
- mfhi $t_2 |
- slt $at,$t_2,$zero |
- $ADDU $c_2,$at |
- $MULTU $a_4,$a_4 # mul_add_c(a[4],b[4],c3,c1,c2); |
- $SLL $t_2,1 |
- slt $a2,$t_1,$zero |
- $ADDU $t_2,$a2 |
- $SLL $t_1,1 |
- $ADDU $c_3,$t_1 |
- sltu $at,$c_3,$t_1 |
- $ADDU $t_2,$at |
- $ADDU $c_1,$t_2 |
- sltu $at,$c_1,$t_2 |
- $ADDU $c_2,$at |
- mflo $t_1 |
- mfhi $t_2 |
- $ADDU $c_3,$t_1 |
- sltu $at,$c_3,$t_1 |
- $MULTU $a_2,$a_7 # mul_add_c2(a[2],b[7],c1,c2,c3); |
- $ADDU $t_2,$at |
- $ADDU $c_1,$t_2 |
- sltu $at,$c_1,$t_2 |
- $ADDU $c_2,$at |
- $ST $c_3,8*$BNSZ($a0) |
- |
- mflo $t_1 |
- mfhi $t_2 |
- slt $c_3,$t_2,$zero |
- $SLL $t_2,1 |
- $MULTU $a_3,$a_6 # mul_add_c2(a[3],b[6],c1,c2,c3); |
- slt $a2,$t_1,$zero |
- $ADDU $t_2,$a2 |
- $SLL $t_1,1 |
- $ADDU $c_1,$t_1 |
- sltu $at,$c_1,$t_1 |
- $ADDU $t_2,$at |
- $ADDU $c_2,$t_2 |
- sltu $at,$c_2,$t_2 |
- $ADDU $c_3,$at |
- mflo $t_1 |
- mfhi $t_2 |
- slt $at,$t_2,$zero |
- $ADDU $c_3,$at |
- $MULTU $a_4,$a_5 # mul_add_c2(a[4],b[5],c1,c2,c3); |
- $SLL $t_2,1 |
- slt $a2,$t_1,$zero |
- $ADDU $t_2,$a2 |
- $SLL $t_1,1 |
- $ADDU $c_1,$t_1 |
- sltu $at,$c_1,$t_1 |
- $ADDU $t_2,$at |
- $ADDU $c_2,$t_2 |
- sltu $at,$c_2,$t_2 |
- $ADDU $c_3,$at |
- mflo $t_1 |
- mfhi $t_2 |
- slt $at,$t_2,$zero |
- $ADDU $c_3,$at |
- $MULTU $a_7,$a_3 # mul_add_c2(a[7],b[3],c2,c3,c1); |
- $SLL $t_2,1 |
- slt $a2,$t_1,$zero |
- $ADDU $t_2,$a2 |
- $SLL $t_1,1 |
- $ADDU $c_1,$t_1 |
- sltu $at,$c_1,$t_1 |
- $ADDU $t_2,$at |
- $ADDU $c_2,$t_2 |
- sltu $at,$c_2,$t_2 |
- $ADDU $c_3,$at |
- $ST $c_1,9*$BNSZ($a0) |
- |
- mflo $t_1 |
- mfhi $t_2 |
- slt $c_1,$t_2,$zero |
- $SLL $t_2,1 |
- $MULTU $a_6,$a_4 # mul_add_c2(a[6],b[4],c2,c3,c1); |
- slt $a2,$t_1,$zero |
- $ADDU $t_2,$a2 |
- $SLL $t_1,1 |
- $ADDU $c_2,$t_1 |
- sltu $at,$c_2,$t_1 |
- $ADDU $t_2,$at |
- $ADDU $c_3,$t_2 |
- sltu $at,$c_3,$t_2 |
- $ADDU $c_1,$at |
- mflo $t_1 |
- mfhi $t_2 |
- slt $at,$t_2,$zero |
- $ADDU $c_1,$at |
- $MULTU $a_5,$a_5 # mul_add_c(a[5],b[5],c2,c3,c1); |
- $SLL $t_2,1 |
- slt $a2,$t_1,$zero |
- $ADDU $t_2,$a2 |
- $SLL $t_1,1 |
- $ADDU $c_2,$t_1 |
- sltu $at,$c_2,$t_1 |
- $ADDU $t_2,$at |
- $ADDU $c_3,$t_2 |
- sltu $at,$c_3,$t_2 |
- $ADDU $c_1,$at |
- mflo $t_1 |
- mfhi $t_2 |
- $ADDU $c_2,$t_1 |
- sltu $at,$c_2,$t_1 |
- $MULTU $a_4,$a_7 # mul_add_c2(a[4],b[7],c3,c1,c2); |
- $ADDU $t_2,$at |
- $ADDU $c_3,$t_2 |
- sltu $at,$c_3,$t_2 |
- $ADDU $c_1,$at |
- $ST $c_2,10*$BNSZ($a0) |
- |
- mflo $t_1 |
- mfhi $t_2 |
- slt $c_2,$t_2,$zero |
- $SLL $t_2,1 |
- $MULTU $a_5,$a_6 # mul_add_c2(a[5],b[6],c3,c1,c2); |
- slt $a2,$t_1,$zero |
- $ADDU $t_2,$a2 |
- $SLL $t_1,1 |
- $ADDU $c_3,$t_1 |
- sltu $at,$c_3,$t_1 |
- $ADDU $t_2,$at |
- $ADDU $c_1,$t_2 |
- sltu $at,$c_1,$t_2 |
- $ADDU $c_2,$at |
- mflo $t_1 |
- mfhi $t_2 |
- slt $at,$t_2,$zero |
- $ADDU $c_2,$at |
- $MULTU $a_7,$a_5 # mul_add_c2(a[7],b[5],c1,c2,c3); |
- $SLL $t_2,1 |
- slt $a2,$t_1,$zero |
- $ADDU $t_2,$a2 |
- $SLL $t_1,1 |
- $ADDU $c_3,$t_1 |
- sltu $at,$c_3,$t_1 |
- $ADDU $t_2,$at |
- $ADDU $c_1,$t_2 |
- sltu $at,$c_1,$t_2 |
- $ADDU $c_2,$at |
- $ST $c_3,11*$BNSZ($a0) |
- |
- mflo $t_1 |
- mfhi $t_2 |
- slt $c_3,$t_2,$zero |
- $SLL $t_2,1 |
- $MULTU $a_6,$a_6 # mul_add_c(a[6],b[6],c1,c2,c3); |
- slt $a2,$t_1,$zero |
- $ADDU $t_2,$a2 |
- $SLL $t_1,1 |
- $ADDU $c_1,$t_1 |
- sltu $at,$c_1,$t_1 |
- $ADDU $t_2,$at |
- $ADDU $c_2,$t_2 |
- sltu $at,$c_2,$t_2 |
- $ADDU $c_3,$at |
- mflo $t_1 |
- mfhi $t_2 |
- $ADDU $c_1,$t_1 |
- sltu $at,$c_1,$t_1 |
- $MULTU $a_6,$a_7 # mul_add_c2(a[6],b[7],c2,c3,c1); |
- $ADDU $t_2,$at |
- $ADDU $c_2,$t_2 |
- sltu $at,$c_2,$t_2 |
- $ADDU $c_3,$at |
- $ST $c_1,12*$BNSZ($a0) |
- |
- mflo $t_1 |
- mfhi $t_2 |
- slt $c_1,$t_2,$zero |
- $SLL $t_2,1 |
- $MULTU $a_7,$a_7 # mul_add_c(a[7],b[7],c3,c1,c2); |
- slt $a2,$t_1,$zero |
- $ADDU $t_2,$a2 |
- $SLL $t_1,1 |
- $ADDU $c_2,$t_1 |
- sltu $at,$c_2,$t_1 |
- $ADDU $t_2,$at |
- $ADDU $c_3,$t_2 |
- sltu $at,$c_3,$t_2 |
- $ADDU $c_1,$at |
- $ST $c_2,13*$BNSZ($a0) |
- |
- mflo $t_1 |
- mfhi $t_2 |
- $ADDU $c_3,$t_1 |
- sltu $at,$c_3,$t_1 |
- $ADDU $t_2,$at |
- $ADDU $c_1,$t_2 |
- $ST $c_3,14*$BNSZ($a0) |
- $ST $c_1,15*$BNSZ($a0) |
- |
- .set noreorder |
-___ |
-$code.=<<___ if ($flavour =~ /nubi/i); |
- $REG_L $t3,4*$SZREG($sp) |
- $REG_L $t2,3*$SZREG($sp) |
- $REG_L $t1,2*$SZREG($sp) |
- $REG_L $t0,1*$SZREG($sp) |
- $REG_L $gp,0*$SZREG($sp) |
- $PTR_ADD $sp,6*$SZREG |
-___ |
-$code.=<<___; |
- jr $ra |
- nop |
-.end bn_sqr_comba8 |
- |
-.align 5 |
-.globl bn_sqr_comba4 |
-.ent bn_sqr_comba4 |
-bn_sqr_comba4: |
-___ |
-$code.=<<___ if ($flavour =~ /nubi/i); |
- .frame $sp,6*$SZREG,$ra |
- .mask 0x8000f008,-$SZREG |
- .set noreorder |
- $PTR_SUB $sp,6*$SZREG |
- $REG_S $ra,5*$SZREG($sp) |
- $REG_S $t3,4*$SZREG($sp) |
- $REG_S $t2,3*$SZREG($sp) |
- $REG_S $t1,2*$SZREG($sp) |
- $REG_S $t0,1*$SZREG($sp) |
- $REG_S $gp,0*$SZREG($sp) |
-___ |
-$code.=<<___; |
- .set reorder |
- $LD $a_0,0($a1) |
- $LD $a_1,$BNSZ($a1) |
- $MULTU $a_0,$a_0 # mul_add_c(a[0],b[0],c1,c2,c3); |
- $LD $a_2,2*$BNSZ($a1) |
- $LD $a_3,3*$BNSZ($a1) |
- mflo $c_1 |
- mfhi $c_2 |
- $ST $c_1,0($a0) |
- |
- $MULTU $a_0,$a_1 # mul_add_c2(a[0],b[1],c2,c3,c1); |
- mflo $t_1 |
- mfhi $t_2 |
- slt $c_1,$t_2,$zero |
- $SLL $t_2,1 |
- $MULTU $a_2,$a_0 # mul_add_c2(a[2],b[0],c3,c1,c2); |
- slt $a2,$t_1,$zero |
- $ADDU $t_2,$a2 |
- $SLL $t_1,1 |
- $ADDU $c_2,$t_1 |
- sltu $at,$c_2,$t_1 |
- $ADDU $c_3,$t_2,$at |
- $ST $c_2,$BNSZ($a0) |
- |
- mflo $t_1 |
- mfhi $t_2 |
- slt $c_2,$t_2,$zero |
- $SLL $t_2,1 |
- $MULTU $a_1,$a_1 # mul_add_c(a[1],b[1],c3,c1,c2); |
- slt $a2,$t_1,$zero |
- $ADDU $t_2,$a2 |
- $SLL $t_1,1 |
- $ADDU $c_3,$t_1 |
- sltu $at,$c_3,$t_1 |
- $ADDU $t_2,$at |
- $ADDU $c_1,$t_2 |
- sltu $at,$c_1,$t_2 |
- $ADDU $c_2,$at |
- mflo $t_1 |
- mfhi $t_2 |
- $ADDU $c_3,$t_1 |
- sltu $at,$c_3,$t_1 |
- $MULTU $a_0,$a_3 # mul_add_c2(a[0],b[3],c1,c2,c3); |
- $ADDU $t_2,$at |
- $ADDU $c_1,$t_2 |
- sltu $at,$c_1,$t_2 |
- $ADDU $c_2,$at |
- $ST $c_3,2*$BNSZ($a0) |
- |
- mflo $t_1 |
- mfhi $t_2 |
- slt $c_3,$t_2,$zero |
- $SLL $t_2,1 |
- $MULTU $a_1,$a_2 # mul_add_c(a2[1],b[2],c1,c2,c3); |
- slt $a2,$t_1,$zero |
- $ADDU $t_2,$a2 |
- $SLL $t_1,1 |
- $ADDU $c_1,$t_1 |
- sltu $at,$c_1,$t_1 |
- $ADDU $t_2,$at |
- $ADDU $c_2,$t_2 |
- sltu $at,$c_2,$t_2 |
- $ADDU $c_3,$at |
- mflo $t_1 |
- mfhi $t_2 |
- slt $at,$t_2,$zero |
- $ADDU $c_3,$at |
- $MULTU $a_3,$a_1 # mul_add_c2(a[3],b[1],c2,c3,c1); |
- $SLL $t_2,1 |
- slt $a2,$t_1,$zero |
- $ADDU $t_2,$a2 |
- $SLL $t_1,1 |
- $ADDU $c_1,$t_1 |
- sltu $at,$c_1,$t_1 |
- $ADDU $t_2,$at |
- $ADDU $c_2,$t_2 |
- sltu $at,$c_2,$t_2 |
- $ADDU $c_3,$at |
- $ST $c_1,3*$BNSZ($a0) |
- |
- mflo $t_1 |
- mfhi $t_2 |
- slt $c_1,$t_2,$zero |
- $SLL $t_2,1 |
- $MULTU $a_2,$a_2 # mul_add_c(a[2],b[2],c2,c3,c1); |
- slt $a2,$t_1,$zero |
- $ADDU $t_2,$a2 |
- $SLL $t_1,1 |
- $ADDU $c_2,$t_1 |
- sltu $at,$c_2,$t_1 |
- $ADDU $t_2,$at |
- $ADDU $c_3,$t_2 |
- sltu $at,$c_3,$t_2 |
- $ADDU $c_1,$at |
- mflo $t_1 |
- mfhi $t_2 |
- $ADDU $c_2,$t_1 |
- sltu $at,$c_2,$t_1 |
- $MULTU $a_2,$a_3 # mul_add_c2(a[2],b[3],c3,c1,c2); |
- $ADDU $t_2,$at |
- $ADDU $c_3,$t_2 |
- sltu $at,$c_3,$t_2 |
- $ADDU $c_1,$at |
- $ST $c_2,4*$BNSZ($a0) |
- |
- mflo $t_1 |
- mfhi $t_2 |
- slt $c_2,$t_2,$zero |
- $SLL $t_2,1 |
- $MULTU $a_3,$a_3 # mul_add_c(a[3],b[3],c1,c2,c3); |
- slt $a2,$t_1,$zero |
- $ADDU $t_2,$a2 |
- $SLL $t_1,1 |
- $ADDU $c_3,$t_1 |
- sltu $at,$c_3,$t_1 |
- $ADDU $t_2,$at |
- $ADDU $c_1,$t_2 |
- sltu $at,$c_1,$t_2 |
- $ADDU $c_2,$at |
- $ST $c_3,5*$BNSZ($a0) |
- |
- mflo $t_1 |
- mfhi $t_2 |
- $ADDU $c_1,$t_1 |
- sltu $at,$c_1,$t_1 |
- $ADDU $t_2,$at |
- $ADDU $c_2,$t_2 |
- $ST $c_1,6*$BNSZ($a0) |
- $ST $c_2,7*$BNSZ($a0) |
- |
- .set noreorder |
-___ |
-$code.=<<___ if ($flavour =~ /nubi/i); |
- $REG_L $t3,4*$SZREG($sp) |
- $REG_L $t2,3*$SZREG($sp) |
- $REG_L $t1,2*$SZREG($sp) |
- $REG_L $t0,1*$SZREG($sp) |
- $REG_L $gp,0*$SZREG($sp) |
- $PTR_ADD $sp,6*$SZREG |
-___ |
-$code.=<<___; |
- jr $ra |
- nop |
-.end bn_sqr_comba4 |
-___ |
-print $code; |
-close STDOUT; |