| Index: third_party/libwebp/dsp/enc_mips32.c
|
| diff --git a/third_party/libwebp/dsp/enc_mips32.c b/third_party/libwebp/dsp/enc_mips32.c
|
| index 6cede18ad619dfd144a7874043f9cde549a411c6..fd10143de9068199fff9fb7e68f332ec7c953413 100644
|
| --- a/third_party/libwebp/dsp/enc_mips32.c
|
| +++ b/third_party/libwebp/dsp/enc_mips32.c
|
| @@ -17,13 +17,10 @@
|
|
|
| #if defined(WEBP_USE_MIPS32)
|
|
|
| +#include "./mips_macro.h"
|
| #include "../enc/vp8enci.h"
|
| #include "../enc/cost.h"
|
|
|
| -#if defined(__GNUC__) && defined(__ANDROID__) && LOCAL_GCC_VERSION == 0x409
|
| -#define WORK_AROUND_GCC
|
| -#endif
|
| -
|
| static const int kC1 = 20091 + (1 << 16);
|
| static const int kC2 = 35468;
|
|
|
| @@ -59,61 +56,61 @@ static const int kC2 = 35468;
|
| // MUL and STORE macros inlined
|
| // a = clip_8b(a) is replaced with: a = max(a, 0); a = min(a, 255)
|
| // temp0..temp15 holds tmp[0]..tmp[15]
|
| -// A..D - offsets in bytes to load from ref and store to dst buffer
|
| +// A - row index (byte offset is BPS * A) to load from ref and store to dst
|
| // TEMP0, TEMP4, TEMP8 and TEMP12 - registers for corresponding tmp elements
|
| -#define HORIZONTAL_PASS(A, B, C, D, TEMP0, TEMP4, TEMP8, TEMP12) \
|
| - "addiu %[" #TEMP0 "], %[" #TEMP0 "], 4 \n\t" \
|
| - "addu %[temp16], %[" #TEMP0 "], %[" #TEMP8 "] \n\t" \
|
| - "subu %[temp17], %[" #TEMP0 "], %[" #TEMP8 "] \n\t" \
|
| - "mul %[" #TEMP0 "], %[" #TEMP4 "], %[kC2] \n\t" \
|
| - "mul %[" #TEMP8 "], %[" #TEMP12 "], %[kC1] \n\t" \
|
| - "mul %[" #TEMP4 "], %[" #TEMP4 "], %[kC1] \n\t" \
|
| - "mul %[" #TEMP12 "], %[" #TEMP12 "], %[kC2] \n\t" \
|
| - "sra %[" #TEMP0 "], %[" #TEMP0 "], 16 \n\t" \
|
| - "sra %[" #TEMP8 "], %[" #TEMP8 "], 16 \n\t" \
|
| - "sra %[" #TEMP4 "], %[" #TEMP4 "], 16 \n\t" \
|
| - "sra %[" #TEMP12 "], %[" #TEMP12 "], 16 \n\t" \
|
| - "subu %[temp18], %[" #TEMP0 "], %[" #TEMP8 "] \n\t" \
|
| - "addu %[temp19], %[" #TEMP4 "], %[" #TEMP12 "] \n\t" \
|
| - "addu %[" #TEMP0 "], %[temp16], %[temp19] \n\t" \
|
| - "addu %[" #TEMP4 "], %[temp17], %[temp18] \n\t" \
|
| - "subu %[" #TEMP8 "], %[temp17], %[temp18] \n\t" \
|
| - "subu %[" #TEMP12 "], %[temp16], %[temp19] \n\t" \
|
| - "lw %[temp20], 0(%[args]) \n\t" \
|
| - "sra %[" #TEMP0 "], %[" #TEMP0 "], 3 \n\t" \
|
| - "sra %[" #TEMP4 "], %[" #TEMP4 "], 3 \n\t" \
|
| - "sra %[" #TEMP8 "], %[" #TEMP8 "], 3 \n\t" \
|
| - "sra %[" #TEMP12 "], %[" #TEMP12 "], 3 \n\t" \
|
| - "lbu %[temp16], " #A "(%[temp20]) \n\t" \
|
| - "lbu %[temp17], " #B "(%[temp20]) \n\t" \
|
| - "lbu %[temp18], " #C "(%[temp20]) \n\t" \
|
| - "lbu %[temp19], " #D "(%[temp20]) \n\t" \
|
| - "addu %[" #TEMP0 "], %[temp16], %[" #TEMP0 "] \n\t" \
|
| - "addu %[" #TEMP4 "], %[temp17], %[" #TEMP4 "] \n\t" \
|
| - "addu %[" #TEMP8 "], %[temp18], %[" #TEMP8 "] \n\t" \
|
| - "addu %[" #TEMP12 "], %[temp19], %[" #TEMP12 "] \n\t" \
|
| - "slt %[temp16], %[" #TEMP0 "], $zero \n\t" \
|
| - "slt %[temp17], %[" #TEMP4 "], $zero \n\t" \
|
| - "slt %[temp18], %[" #TEMP8 "], $zero \n\t" \
|
| - "slt %[temp19], %[" #TEMP12 "], $zero \n\t" \
|
| - "movn %[" #TEMP0 "], $zero, %[temp16] \n\t" \
|
| - "movn %[" #TEMP4 "], $zero, %[temp17] \n\t" \
|
| - "movn %[" #TEMP8 "], $zero, %[temp18] \n\t" \
|
| - "movn %[" #TEMP12 "], $zero, %[temp19] \n\t" \
|
| - "addiu %[temp20], $zero, 255 \n\t" \
|
| - "slt %[temp16], %[" #TEMP0 "], %[temp20] \n\t" \
|
| - "slt %[temp17], %[" #TEMP4 "], %[temp20] \n\t" \
|
| - "slt %[temp18], %[" #TEMP8 "], %[temp20] \n\t" \
|
| - "slt %[temp19], %[" #TEMP12 "], %[temp20] \n\t" \
|
| - "movz %[" #TEMP0 "], %[temp20], %[temp16] \n\t" \
|
| - "movz %[" #TEMP4 "], %[temp20], %[temp17] \n\t" \
|
| - "lw %[temp16], 8(%[args]) \n\t" \
|
| - "movz %[" #TEMP8 "], %[temp20], %[temp18] \n\t" \
|
| - "movz %[" #TEMP12 "], %[temp20], %[temp19] \n\t" \
|
| - "sb %[" #TEMP0 "], " #A "(%[temp16]) \n\t" \
|
| - "sb %[" #TEMP4 "], " #B "(%[temp16]) \n\t" \
|
| - "sb %[" #TEMP8 "], " #C "(%[temp16]) \n\t" \
|
| - "sb %[" #TEMP12 "], " #D "(%[temp16]) \n\t"
|
| +#define HORIZONTAL_PASS(A, TEMP0, TEMP4, TEMP8, TEMP12) \
|
| + "addiu %[" #TEMP0 "], %[" #TEMP0 "], 4 \n\t" \
|
| + "addu %[temp16], %[" #TEMP0 "], %[" #TEMP8 "] \n\t" \
|
| + "subu %[temp17], %[" #TEMP0 "], %[" #TEMP8 "] \n\t" \
|
| + "mul %[" #TEMP0 "], %[" #TEMP4 "], %[kC2] \n\t" \
|
| + "mul %[" #TEMP8 "], %[" #TEMP12 "], %[kC1] \n\t" \
|
| + "mul %[" #TEMP4 "], %[" #TEMP4 "], %[kC1] \n\t" \
|
| + "mul %[" #TEMP12 "], %[" #TEMP12 "], %[kC2] \n\t" \
|
| + "sra %[" #TEMP0 "], %[" #TEMP0 "], 16 \n\t" \
|
| + "sra %[" #TEMP8 "], %[" #TEMP8 "], 16 \n\t" \
|
| + "sra %[" #TEMP4 "], %[" #TEMP4 "], 16 \n\t" \
|
| + "sra %[" #TEMP12 "], %[" #TEMP12 "], 16 \n\t" \
|
| + "subu %[temp18], %[" #TEMP0 "], %[" #TEMP8 "] \n\t" \
|
| + "addu %[temp19], %[" #TEMP4 "], %[" #TEMP12 "] \n\t" \
|
| + "addu %[" #TEMP0 "], %[temp16], %[temp19] \n\t" \
|
| + "addu %[" #TEMP4 "], %[temp17], %[temp18] \n\t" \
|
| + "subu %[" #TEMP8 "], %[temp17], %[temp18] \n\t" \
|
| + "subu %[" #TEMP12 "], %[temp16], %[temp19] \n\t" \
|
| + "lw %[temp20], 0(%[args]) \n\t" \
|
| + "sra %[" #TEMP0 "], %[" #TEMP0 "], 3 \n\t" \
|
| + "sra %[" #TEMP4 "], %[" #TEMP4 "], 3 \n\t" \
|
| + "sra %[" #TEMP8 "], %[" #TEMP8 "], 3 \n\t" \
|
| + "sra %[" #TEMP12 "], %[" #TEMP12 "], 3 \n\t" \
|
| + "lbu %[temp16], 0+" XSTR(BPS) "*" #A "(%[temp20]) \n\t" \
|
| + "lbu %[temp17], 1+" XSTR(BPS) "*" #A "(%[temp20]) \n\t" \
|
| + "lbu %[temp18], 2+" XSTR(BPS) "*" #A "(%[temp20]) \n\t" \
|
| + "lbu %[temp19], 3+" XSTR(BPS) "*" #A "(%[temp20]) \n\t" \
|
| + "addu %[" #TEMP0 "], %[temp16], %[" #TEMP0 "] \n\t" \
|
| + "addu %[" #TEMP4 "], %[temp17], %[" #TEMP4 "] \n\t" \
|
| + "addu %[" #TEMP8 "], %[temp18], %[" #TEMP8 "] \n\t" \
|
| + "addu %[" #TEMP12 "], %[temp19], %[" #TEMP12 "] \n\t" \
|
| + "slt %[temp16], %[" #TEMP0 "], $zero \n\t" \
|
| + "slt %[temp17], %[" #TEMP4 "], $zero \n\t" \
|
| + "slt %[temp18], %[" #TEMP8 "], $zero \n\t" \
|
| + "slt %[temp19], %[" #TEMP12 "], $zero \n\t" \
|
| + "movn %[" #TEMP0 "], $zero, %[temp16] \n\t" \
|
| + "movn %[" #TEMP4 "], $zero, %[temp17] \n\t" \
|
| + "movn %[" #TEMP8 "], $zero, %[temp18] \n\t" \
|
| + "movn %[" #TEMP12 "], $zero, %[temp19] \n\t" \
|
| + "addiu %[temp20], $zero, 255 \n\t" \
|
| + "slt %[temp16], %[" #TEMP0 "], %[temp20] \n\t" \
|
| + "slt %[temp17], %[" #TEMP4 "], %[temp20] \n\t" \
|
| + "slt %[temp18], %[" #TEMP8 "], %[temp20] \n\t" \
|
| + "slt %[temp19], %[" #TEMP12 "], %[temp20] \n\t" \
|
| + "movz %[" #TEMP0 "], %[temp20], %[temp16] \n\t" \
|
| + "movz %[" #TEMP4 "], %[temp20], %[temp17] \n\t" \
|
| + "lw %[temp16], 8(%[args]) \n\t" \
|
| + "movz %[" #TEMP8 "], %[temp20], %[temp18] \n\t" \
|
| + "movz %[" #TEMP12 "], %[temp20], %[temp19] \n\t" \
|
| + "sb %[" #TEMP0 "], 0+" XSTR(BPS) "*" #A "(%[temp16]) \n\t" \
|
| + "sb %[" #TEMP4 "], 1+" XSTR(BPS) "*" #A "(%[temp16]) \n\t" \
|
| + "sb %[" #TEMP8 "], 2+" XSTR(BPS) "*" #A "(%[temp16]) \n\t" \
|
| + "sb %[" #TEMP12 "], 3+" XSTR(BPS) "*" #A "(%[temp16]) \n\t"
|
|
|
| // Does one or two inverse transforms.
|
| static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in,
|
| @@ -130,10 +127,10 @@ static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in,
|
| VERTICAL_PASS(4, 20, 12, 28, temp12, temp8, temp9, temp10, temp11)
|
| VERTICAL_PASS(6, 22, 14, 30, temp20, temp12, temp13, temp14, temp15)
|
|
|
| - HORIZONTAL_PASS( 0, 1, 2, 3, temp0, temp4, temp8, temp12)
|
| - HORIZONTAL_PASS(16, 17, 18, 19, temp1, temp5, temp9, temp13)
|
| - HORIZONTAL_PASS(32, 33, 34, 35, temp2, temp6, temp10, temp14)
|
| - HORIZONTAL_PASS(48, 49, 50, 51, temp3, temp7, temp11, temp15)
|
| + HORIZONTAL_PASS(0, temp0, temp4, temp8, temp12)
|
| + HORIZONTAL_PASS(1, temp1, temp5, temp9, temp13)
|
| + HORIZONTAL_PASS(2, temp2, temp6, temp10, temp14)
|
| + HORIZONTAL_PASS(3, temp3, temp7, temp11, temp15)
|
|
|
| : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
|
| [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
|
| @@ -241,46 +238,54 @@ static int QuantizeBlock(int16_t in[16], int16_t out[16],
|
| return 0;
|
| }
|
|
|
| +static int Quantize2Blocks(int16_t in[32], int16_t out[32],
|
| + const VP8Matrix* const mtx) {
|
| + int nz;
|
| + nz = QuantizeBlock(in + 0 * 16, out + 0 * 16, mtx) << 0;
|
| + nz |= QuantizeBlock(in + 1 * 16, out + 1 * 16, mtx) << 1;
|
| + return nz;
|
| +}
|
| +
|
| #undef QUANTIZE_ONE
|
|
|
| // macro for one horizontal pass in Disto4x4 (TTransform)
|
| // two calls of function TTransform are merged into single one
|
| -// A..D - offsets in bytes to load from a and b buffers
|
| +// A - row index (byte offset is BPS * A) to load from a and b buffers
|
| // E..H - offsets in bytes to store first results to tmp buffer
|
| // E1..H1 - offsets in bytes to store second results to tmp buffer
|
| -#define HORIZONTAL_PASS(A, B, C, D, E, F, G, H, E1, F1, G1, H1) \
|
| - "lbu %[temp0], " #A "(%[a]) \n\t" \
|
| - "lbu %[temp1], " #B "(%[a]) \n\t" \
|
| - "lbu %[temp2], " #C "(%[a]) \n\t" \
|
| - "lbu %[temp3], " #D "(%[a]) \n\t" \
|
| - "lbu %[temp4], " #A "(%[b]) \n\t" \
|
| - "lbu %[temp5], " #B "(%[b]) \n\t" \
|
| - "lbu %[temp6], " #C "(%[b]) \n\t" \
|
| - "lbu %[temp7], " #D "(%[b]) \n\t" \
|
| - "addu %[temp8], %[temp0], %[temp2] \n\t" \
|
| - "subu %[temp0], %[temp0], %[temp2] \n\t" \
|
| - "addu %[temp2], %[temp1], %[temp3] \n\t" \
|
| - "subu %[temp1], %[temp1], %[temp3] \n\t" \
|
| - "addu %[temp3], %[temp4], %[temp6] \n\t" \
|
| - "subu %[temp4], %[temp4], %[temp6] \n\t" \
|
| - "addu %[temp6], %[temp5], %[temp7] \n\t" \
|
| - "subu %[temp5], %[temp5], %[temp7] \n\t" \
|
| - "addu %[temp7], %[temp8], %[temp2] \n\t" \
|
| - "subu %[temp2], %[temp8], %[temp2] \n\t" \
|
| - "addu %[temp8], %[temp0], %[temp1] \n\t" \
|
| - "subu %[temp0], %[temp0], %[temp1] \n\t" \
|
| - "addu %[temp1], %[temp3], %[temp6] \n\t" \
|
| - "subu %[temp3], %[temp3], %[temp6] \n\t" \
|
| - "addu %[temp6], %[temp4], %[temp5] \n\t" \
|
| - "subu %[temp4], %[temp4], %[temp5] \n\t" \
|
| - "sw %[temp7], " #E "(%[tmp]) \n\t" \
|
| - "sw %[temp2], " #H "(%[tmp]) \n\t" \
|
| - "sw %[temp8], " #F "(%[tmp]) \n\t" \
|
| - "sw %[temp0], " #G "(%[tmp]) \n\t" \
|
| - "sw %[temp1], " #E1 "(%[tmp]) \n\t" \
|
| - "sw %[temp3], " #H1 "(%[tmp]) \n\t" \
|
| - "sw %[temp6], " #F1 "(%[tmp]) \n\t" \
|
| - "sw %[temp4], " #G1 "(%[tmp]) \n\t"
|
| +#define HORIZONTAL_PASS(A, E, F, G, H, E1, F1, G1, H1) \
|
| + "lbu %[temp0], 0+" XSTR(BPS) "*" #A "(%[a]) \n\t" \
|
| + "lbu %[temp1], 1+" XSTR(BPS) "*" #A "(%[a]) \n\t" \
|
| + "lbu %[temp2], 2+" XSTR(BPS) "*" #A "(%[a]) \n\t" \
|
| + "lbu %[temp3], 3+" XSTR(BPS) "*" #A "(%[a]) \n\t" \
|
| + "lbu %[temp4], 0+" XSTR(BPS) "*" #A "(%[b]) \n\t" \
|
| + "lbu %[temp5], 1+" XSTR(BPS) "*" #A "(%[b]) \n\t" \
|
| + "lbu %[temp6], 2+" XSTR(BPS) "*" #A "(%[b]) \n\t" \
|
| + "lbu %[temp7], 3+" XSTR(BPS) "*" #A "(%[b]) \n\t" \
|
| + "addu %[temp8], %[temp0], %[temp2] \n\t" \
|
| + "subu %[temp0], %[temp0], %[temp2] \n\t" \
|
| + "addu %[temp2], %[temp1], %[temp3] \n\t" \
|
| + "subu %[temp1], %[temp1], %[temp3] \n\t" \
|
| + "addu %[temp3], %[temp4], %[temp6] \n\t" \
|
| + "subu %[temp4], %[temp4], %[temp6] \n\t" \
|
| + "addu %[temp6], %[temp5], %[temp7] \n\t" \
|
| + "subu %[temp5], %[temp5], %[temp7] \n\t" \
|
| + "addu %[temp7], %[temp8], %[temp2] \n\t" \
|
| + "subu %[temp2], %[temp8], %[temp2] \n\t" \
|
| + "addu %[temp8], %[temp0], %[temp1] \n\t" \
|
| + "subu %[temp0], %[temp0], %[temp1] \n\t" \
|
| + "addu %[temp1], %[temp3], %[temp6] \n\t" \
|
| + "subu %[temp3], %[temp3], %[temp6] \n\t" \
|
| + "addu %[temp6], %[temp4], %[temp5] \n\t" \
|
| + "subu %[temp4], %[temp4], %[temp5] \n\t" \
|
| + "sw %[temp7], " #E "(%[tmp]) \n\t" \
|
| + "sw %[temp2], " #H "(%[tmp]) \n\t" \
|
| + "sw %[temp8], " #F "(%[tmp]) \n\t" \
|
| + "sw %[temp0], " #G "(%[tmp]) \n\t" \
|
| + "sw %[temp1], " #E1 "(%[tmp]) \n\t" \
|
| + "sw %[temp3], " #H1 "(%[tmp]) \n\t" \
|
| + "sw %[temp6], " #F1 "(%[tmp]) \n\t" \
|
| + "sw %[temp4], " #G1 "(%[tmp]) \n\t"
|
|
|
| // macro for one vertical pass in Disto4x4 (TTransform)
|
| // two calls of function TTransform are merged into single one
|
| @@ -362,10 +367,10 @@ static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
|
| int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;
|
|
|
| __asm__ volatile(
|
| - HORIZONTAL_PASS( 0, 1, 2, 3, 0, 4, 8, 12, 64, 68, 72, 76)
|
| - HORIZONTAL_PASS(16, 17, 18, 19, 16, 20, 24, 28, 80, 84, 88, 92)
|
| - HORIZONTAL_PASS(32, 33, 34, 35, 32, 36, 40, 44, 96, 100, 104, 108)
|
| - HORIZONTAL_PASS(48, 49, 50, 51, 48, 52, 56, 60, 112, 116, 120, 124)
|
| + HORIZONTAL_PASS(0, 0, 4, 8, 12, 64, 68, 72, 76)
|
| + HORIZONTAL_PASS(1, 16, 20, 24, 28, 80, 84, 88, 92)
|
| + HORIZONTAL_PASS(2, 32, 36, 40, 44, 96, 100, 104, 108)
|
| + HORIZONTAL_PASS(3, 48, 52, 56, 60, 112, 116, 120, 124)
|
| "mthi $zero \n\t"
|
| "mtlo $zero \n\t"
|
| VERTICAL_PASS( 0, 16, 32, 48, 64, 80, 96, 112, 0, 8, 16, 24)
|
| @@ -405,41 +410,41 @@ static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
|
|
|
| // macro for one horizontal pass in FTransform
|
| // temp0..temp15 holds tmp[0]..tmp[15]
|
| -// A..D - offsets in bytes to load from src and ref buffers
|
| +// A - row index (byte offset is BPS * A) to load from src and ref buffers
|
| // TEMP0..TEMP3 - registers for corresponding tmp elements
|
| -#define HORIZONTAL_PASS(A, B, C, D, TEMP0, TEMP1, TEMP2, TEMP3) \
|
| - "lw %[" #TEMP1 "], 0(%[args]) \n\t" \
|
| - "lw %[" #TEMP2 "], 4(%[args]) \n\t" \
|
| - "lbu %[temp16], " #A "(%[" #TEMP1 "]) \n\t" \
|
| - "lbu %[temp17], " #A "(%[" #TEMP2 "]) \n\t" \
|
| - "lbu %[temp18], " #B "(%[" #TEMP1 "]) \n\t" \
|
| - "lbu %[temp19], " #B "(%[" #TEMP2 "]) \n\t" \
|
| - "subu %[temp20], %[temp16], %[temp17] \n\t" \
|
| - "lbu %[temp16], " #C "(%[" #TEMP1 "]) \n\t" \
|
| - "lbu %[temp17], " #C "(%[" #TEMP2 "]) \n\t" \
|
| - "subu %[" #TEMP0 "], %[temp18], %[temp19] \n\t" \
|
| - "lbu %[temp18], " #D "(%[" #TEMP1 "]) \n\t" \
|
| - "lbu %[temp19], " #D "(%[" #TEMP2 "]) \n\t" \
|
| - "subu %[" #TEMP1 "], %[temp16], %[temp17] \n\t" \
|
| - "subu %[" #TEMP2 "], %[temp18], %[temp19] \n\t" \
|
| - "addu %[" #TEMP3 "], %[temp20], %[" #TEMP2 "] \n\t" \
|
| - "subu %[" #TEMP2 "], %[temp20], %[" #TEMP2 "] \n\t" \
|
| - "addu %[temp20], %[" #TEMP0 "], %[" #TEMP1 "] \n\t" \
|
| - "subu %[" #TEMP0 "], %[" #TEMP0 "], %[" #TEMP1 "] \n\t" \
|
| - "mul %[temp16], %[" #TEMP2 "], %[c5352] \n\t" \
|
| - "mul %[temp17], %[" #TEMP2 "], %[c2217] \n\t" \
|
| - "mul %[temp18], %[" #TEMP0 "], %[c5352] \n\t" \
|
| - "mul %[temp19], %[" #TEMP0 "], %[c2217] \n\t" \
|
| - "addu %[" #TEMP1 "], %[" #TEMP3 "], %[temp20] \n\t" \
|
| - "subu %[temp20], %[" #TEMP3 "], %[temp20] \n\t" \
|
| - "sll %[" #TEMP0 "], %[" #TEMP1 "], 3 \n\t" \
|
| - "sll %[" #TEMP2 "], %[temp20], 3 \n\t" \
|
| - "addiu %[temp16], %[temp16], 1812 \n\t" \
|
| - "addiu %[temp17], %[temp17], 937 \n\t" \
|
| - "addu %[temp16], %[temp16], %[temp19] \n\t" \
|
| - "subu %[temp17], %[temp17], %[temp18] \n\t" \
|
| - "sra %[" #TEMP1 "], %[temp16], 9 \n\t" \
|
| - "sra %[" #TEMP3 "], %[temp17], 9 \n\t"
|
| +#define HORIZONTAL_PASS(A, TEMP0, TEMP1, TEMP2, TEMP3) \
|
| + "lw %[" #TEMP1 "], 0(%[args]) \n\t" \
|
| + "lw %[" #TEMP2 "], 4(%[args]) \n\t" \
|
| + "lbu %[temp16], 0+" XSTR(BPS) "*" #A "(%[" #TEMP1 "]) \n\t" \
|
| + "lbu %[temp17], 0+" XSTR(BPS) "*" #A "(%[" #TEMP2 "]) \n\t" \
|
| + "lbu %[temp18], 1+" XSTR(BPS) "*" #A "(%[" #TEMP1 "]) \n\t" \
|
| + "lbu %[temp19], 1+" XSTR(BPS) "*" #A "(%[" #TEMP2 "]) \n\t" \
|
| + "subu %[temp20], %[temp16], %[temp17] \n\t" \
|
| + "lbu %[temp16], 2+" XSTR(BPS) "*" #A "(%[" #TEMP1 "]) \n\t" \
|
| + "lbu %[temp17], 2+" XSTR(BPS) "*" #A "(%[" #TEMP2 "]) \n\t" \
|
| + "subu %[" #TEMP0 "], %[temp18], %[temp19] \n\t" \
|
| + "lbu %[temp18], 3+" XSTR(BPS) "*" #A "(%[" #TEMP1 "]) \n\t" \
|
| + "lbu %[temp19], 3+" XSTR(BPS) "*" #A "(%[" #TEMP2 "]) \n\t" \
|
| + "subu %[" #TEMP1 "], %[temp16], %[temp17] \n\t" \
|
| + "subu %[" #TEMP2 "], %[temp18], %[temp19] \n\t" \
|
| + "addu %[" #TEMP3 "], %[temp20], %[" #TEMP2 "] \n\t" \
|
| + "subu %[" #TEMP2 "], %[temp20], %[" #TEMP2 "] \n\t" \
|
| + "addu %[temp20], %[" #TEMP0 "], %[" #TEMP1 "] \n\t" \
|
| + "subu %[" #TEMP0 "], %[" #TEMP0 "], %[" #TEMP1 "] \n\t" \
|
| + "mul %[temp16], %[" #TEMP2 "], %[c5352] \n\t" \
|
| + "mul %[temp17], %[" #TEMP2 "], %[c2217] \n\t" \
|
| + "mul %[temp18], %[" #TEMP0 "], %[c5352] \n\t" \
|
| + "mul %[temp19], %[" #TEMP0 "], %[c2217] \n\t" \
|
| + "addu %[" #TEMP1 "], %[" #TEMP3 "], %[temp20] \n\t" \
|
| + "subu %[temp20], %[" #TEMP3 "], %[temp20] \n\t" \
|
| + "sll %[" #TEMP0 "], %[" #TEMP1 "], 3 \n\t" \
|
| + "sll %[" #TEMP2 "], %[temp20], 3 \n\t" \
|
| + "addiu %[temp16], %[temp16], 1812 \n\t" \
|
| + "addiu %[temp17], %[temp17], 937 \n\t" \
|
| + "addu %[temp16], %[temp16], %[temp19] \n\t" \
|
| + "subu %[temp17], %[temp17], %[temp18] \n\t" \
|
| + "sra %[" #TEMP1 "], %[temp16], 9 \n\t" \
|
| + "sra %[" #TEMP3 "], %[temp17], 9 \n\t"
|
|
|
| // macro for one vertical pass in FTransform
|
| // temp0..temp15 holds tmp[0]..tmp[15]
|
| @@ -483,10 +488,10 @@ static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
|
| { (const int*)src, (const int*)ref, (const int*)out };
|
|
|
| __asm__ volatile(
|
| - HORIZONTAL_PASS( 0, 1, 2, 3, temp0, temp1, temp2, temp3)
|
| - HORIZONTAL_PASS(16, 17, 18, 19, temp4, temp5, temp6, temp7)
|
| - HORIZONTAL_PASS(32, 33, 34, 35, temp8, temp9, temp10, temp11)
|
| - HORIZONTAL_PASS(48, 49, 50, 51, temp12, temp13, temp14, temp15)
|
| + HORIZONTAL_PASS(0, temp0, temp1, temp2, temp3)
|
| + HORIZONTAL_PASS(1, temp4, temp5, temp6, temp7)
|
| + HORIZONTAL_PASS(2, temp8, temp9, temp10, temp11)
|
| + HORIZONTAL_PASS(3, temp12, temp13, temp14, temp15)
|
| "lw %[temp20], 8(%[args]) \n\t"
|
| VERTICAL_PASS(0, 8, 16, 24, temp0, temp4, temp8, temp12)
|
| VERTICAL_PASS(2, 10, 18, 26, temp1, temp5, temp9, temp13)
|
| @@ -508,118 +513,7 @@ static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
|
| #undef VERTICAL_PASS
|
| #undef HORIZONTAL_PASS
|
|
|
| -// Forward declaration.
|
| -extern int VP8GetResidualCostMIPS32(int ctx0, const VP8Residual* const res);
|
| -
|
| -int VP8GetResidualCostMIPS32(int ctx0, const VP8Residual* const res) {
|
| - int n = res->first;
|
| - // should be prob[VP8EncBands[n]], but it's equivalent for n=0 or 1
|
| - int p0 = res->prob[n][ctx0][0];
|
| - const uint16_t* t = res->cost[n][ctx0];
|
| - int cost;
|
| - const int const_2 = 2;
|
| - const int const_255 = 255;
|
| - const int const_max_level = MAX_VARIABLE_LEVEL;
|
| - int res_cost;
|
| - int res_prob;
|
| - int res_coeffs;
|
| - int res_last;
|
| - int v_reg;
|
| - int b_reg;
|
| - int ctx_reg;
|
| - int cost_add, temp_1, temp_2, temp_3;
|
| -
|
| - if (res->last < 0) {
|
| - return VP8BitCost(0, p0);
|
| - }
|
| -
|
| - cost = (ctx0 == 0) ? VP8BitCost(1, p0) : 0;
|
| -
|
| - res_cost = (int)res->cost;
|
| - res_prob = (int)res->prob;
|
| - res_coeffs = (int)res->coeffs;
|
| - res_last = (int)res->last;
|
| -
|
| - __asm__ volatile(
|
| - ".set push \n\t"
|
| - ".set noreorder \n\t"
|
| -
|
| - "sll %[temp_1], %[n], 1 \n\t"
|
| - "addu %[res_coeffs], %[res_coeffs], %[temp_1] \n\t"
|
| - "slt %[temp_2], %[n], %[res_last] \n\t"
|
| - "bnez %[temp_2], 1f \n\t"
|
| - " li %[cost_add], 0 \n\t"
|
| - "b 2f \n\t"
|
| - " nop \n\t"
|
| - "1: \n\t"
|
| - "lh %[v_reg], 0(%[res_coeffs]) \n\t"
|
| - "addu %[b_reg], %[n], %[VP8EncBands] \n\t"
|
| - "move %[temp_1], %[const_max_level] \n\t"
|
| - "addu %[cost], %[cost], %[cost_add] \n\t"
|
| - "negu %[temp_2], %[v_reg] \n\t"
|
| - "slti %[temp_3], %[v_reg], 0 \n\t"
|
| - "movn %[v_reg], %[temp_2], %[temp_3] \n\t"
|
| - "lbu %[b_reg], 1(%[b_reg]) \n\t"
|
| - "li %[cost_add], 0 \n\t"
|
| -
|
| - "sltiu %[temp_3], %[v_reg], 2 \n\t"
|
| - "move %[ctx_reg], %[v_reg] \n\t"
|
| - "movz %[ctx_reg], %[const_2], %[temp_3] \n\t"
|
| - // cost += VP8LevelCost(t, v);
|
| - "slt %[temp_3], %[v_reg], %[const_max_level] \n\t"
|
| - "movn %[temp_1], %[v_reg], %[temp_3] \n\t"
|
| - "sll %[temp_2], %[v_reg], 1 \n\t"
|
| - "addu %[temp_2], %[temp_2], %[VP8LevelFixedCosts] \n\t"
|
| - "lhu %[temp_2], 0(%[temp_2]) \n\t"
|
| - "sll %[temp_1], %[temp_1], 1 \n\t"
|
| - "addu %[temp_1], %[temp_1], %[t] \n\t"
|
| - "lhu %[temp_3], 0(%[temp_1]) \n\t"
|
| - "addu %[cost], %[cost], %[temp_2] \n\t"
|
| -
|
| - // t = res->cost[b][ctx];
|
| - "sll %[temp_1], %[ctx_reg], 7 \n\t"
|
| - "sll %[temp_2], %[ctx_reg], 3 \n\t"
|
| - "addu %[cost], %[cost], %[temp_3] \n\t"
|
| - "addu %[temp_1], %[temp_1], %[temp_2] \n\t"
|
| - "sll %[temp_2], %[b_reg], 3 \n\t"
|
| - "sll %[temp_3], %[b_reg], 5 \n\t"
|
| - "sub %[temp_2], %[temp_3], %[temp_2] \n\t"
|
| - "sll %[temp_3], %[temp_2], 4 \n\t"
|
| - "addu %[temp_1], %[temp_1], %[temp_3] \n\t"
|
| - "addu %[temp_2], %[temp_2], %[res_cost] \n\t"
|
| - "addiu %[n], %[n], 1 \n\t"
|
| - "addu %[t], %[temp_1], %[temp_2] \n\t"
|
| - "slt %[temp_1], %[n], %[res_last] \n\t"
|
| - "bnez %[temp_1], 1b \n\t"
|
| - " addiu %[res_coeffs], %[res_coeffs], 2 \n\t"
|
| - "2: \n\t"
|
| -
|
| - ".set pop \n\t"
|
| - : [cost]"+r"(cost), [t]"+r"(t), [n]"+r"(n), [v_reg]"=&r"(v_reg),
|
| - [ctx_reg]"=&r"(ctx_reg), [b_reg]"=&r"(b_reg), [cost_add]"=&r"(cost_add),
|
| - [temp_1]"=&r"(temp_1), [temp_2]"=&r"(temp_2), [temp_3]"=&r"(temp_3)
|
| - : [const_2]"r"(const_2), [const_255]"r"(const_255), [res_last]"r"(res_last),
|
| - [VP8EntropyCost]"r"(VP8EntropyCost), [VP8EncBands]"r"(VP8EncBands),
|
| - [const_max_level]"r"(const_max_level), [res_prob]"r"(res_prob),
|
| - [VP8LevelFixedCosts]"r"(VP8LevelFixedCosts), [res_coeffs]"r"(res_coeffs),
|
| - [res_cost]"r"(res_cost)
|
| - : "memory"
|
| - );
|
| -
|
| - // Last coefficient is always non-zero
|
| - {
|
| - const int v = abs(res->coeffs[n]);
|
| - assert(v != 0);
|
| - cost += VP8LevelCost(t, v);
|
| - if (n < 15) {
|
| - const int b = VP8EncBands[n + 1];
|
| - const int ctx = (v == 1) ? 1 : 2;
|
| - const int last_p0 = res->prob[b][ctx][0];
|
| - cost += VP8BitCost(0, last_p0);
|
| - }
|
| - }
|
| - return cost;
|
| -}
|
| +#if !defined(WORK_AROUND_GCC)
|
|
|
| #define GET_SSE_INNER(A, B, C, D) \
|
| "lbu %[temp0], " #A "(%[a]) \n\t" \
|
| @@ -645,7 +539,6 @@ int VP8GetResidualCostMIPS32(int ctx0, const VP8Residual* const res) {
|
| GET_SSE_INNER(C, C + 1, C + 2, C + 3) \
|
| GET_SSE_INNER(D, D + 1, D + 2, D + 3)
|
|
|
| -#if !defined(WORK_AROUND_GCC)
|
| static int SSE16x16(const uint8_t* a, const uint8_t* b) {
|
| int count;
|
| int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
|
| @@ -653,29 +546,29 @@ static int SSE16x16(const uint8_t* a, const uint8_t* b) {
|
| __asm__ volatile(
|
| "mult $zero, $zero \n\t"
|
|
|
| - GET_SSE( 0, 4, 8, 12)
|
| - GET_SSE( 16, 20, 24, 28)
|
| - GET_SSE( 32, 36, 40, 44)
|
| - GET_SSE( 48, 52, 56, 60)
|
| - GET_SSE( 64, 68, 72, 76)
|
| - GET_SSE( 80, 84, 88, 92)
|
| - GET_SSE( 96, 100, 104, 108)
|
| - GET_SSE(112, 116, 120, 124)
|
| - GET_SSE(128, 132, 136, 140)
|
| - GET_SSE(144, 148, 152, 156)
|
| - GET_SSE(160, 164, 168, 172)
|
| - GET_SSE(176, 180, 184, 188)
|
| - GET_SSE(192, 196, 200, 204)
|
| - GET_SSE(208, 212, 216, 220)
|
| - GET_SSE(224, 228, 232, 236)
|
| - GET_SSE(240, 244, 248, 252)
|
| + GET_SSE( 0 * BPS, 4 + 0 * BPS, 8 + 0 * BPS, 12 + 0 * BPS)
|
| + GET_SSE( 1 * BPS, 4 + 1 * BPS, 8 + 1 * BPS, 12 + 1 * BPS)
|
| + GET_SSE( 2 * BPS, 4 + 2 * BPS, 8 + 2 * BPS, 12 + 2 * BPS)
|
| + GET_SSE( 3 * BPS, 4 + 3 * BPS, 8 + 3 * BPS, 12 + 3 * BPS)
|
| + GET_SSE( 4 * BPS, 4 + 4 * BPS, 8 + 4 * BPS, 12 + 4 * BPS)
|
| + GET_SSE( 5 * BPS, 4 + 5 * BPS, 8 + 5 * BPS, 12 + 5 * BPS)
|
| + GET_SSE( 6 * BPS, 4 + 6 * BPS, 8 + 6 * BPS, 12 + 6 * BPS)
|
| + GET_SSE( 7 * BPS, 4 + 7 * BPS, 8 + 7 * BPS, 12 + 7 * BPS)
|
| + GET_SSE( 8 * BPS, 4 + 8 * BPS, 8 + 8 * BPS, 12 + 8 * BPS)
|
| + GET_SSE( 9 * BPS, 4 + 9 * BPS, 8 + 9 * BPS, 12 + 9 * BPS)
|
| + GET_SSE(10 * BPS, 4 + 10 * BPS, 8 + 10 * BPS, 12 + 10 * BPS)
|
| + GET_SSE(11 * BPS, 4 + 11 * BPS, 8 + 11 * BPS, 12 + 11 * BPS)
|
| + GET_SSE(12 * BPS, 4 + 12 * BPS, 8 + 12 * BPS, 12 + 12 * BPS)
|
| + GET_SSE(13 * BPS, 4 + 13 * BPS, 8 + 13 * BPS, 12 + 13 * BPS)
|
| + GET_SSE(14 * BPS, 4 + 14 * BPS, 8 + 14 * BPS, 12 + 14 * BPS)
|
| + GET_SSE(15 * BPS, 4 + 15 * BPS, 8 + 15 * BPS, 12 + 15 * BPS)
|
|
|
| "mflo %[count] \n\t"
|
| : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
|
| [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
|
| [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [count]"=&r"(count)
|
| : [a]"r"(a), [b]"r"(b)
|
| - : "memory", "hi" , "lo"
|
| + : "memory", "hi", "lo"
|
| );
|
| return count;
|
| }
|
| @@ -687,21 +580,21 @@ static int SSE16x8(const uint8_t* a, const uint8_t* b) {
|
| __asm__ volatile(
|
| "mult $zero, $zero \n\t"
|
|
|
| - GET_SSE( 0, 4, 8, 12)
|
| - GET_SSE( 16, 20, 24, 28)
|
| - GET_SSE( 32, 36, 40, 44)
|
| - GET_SSE( 48, 52, 56, 60)
|
| - GET_SSE( 64, 68, 72, 76)
|
| - GET_SSE( 80, 84, 88, 92)
|
| - GET_SSE( 96, 100, 104, 108)
|
| - GET_SSE(112, 116, 120, 124)
|
| + GET_SSE( 0 * BPS, 4 + 0 * BPS, 8 + 0 * BPS, 12 + 0 * BPS)
|
| + GET_SSE( 1 * BPS, 4 + 1 * BPS, 8 + 1 * BPS, 12 + 1 * BPS)
|
| + GET_SSE( 2 * BPS, 4 + 2 * BPS, 8 + 2 * BPS, 12 + 2 * BPS)
|
| + GET_SSE( 3 * BPS, 4 + 3 * BPS, 8 + 3 * BPS, 12 + 3 * BPS)
|
| + GET_SSE( 4 * BPS, 4 + 4 * BPS, 8 + 4 * BPS, 12 + 4 * BPS)
|
| + GET_SSE( 5 * BPS, 4 + 5 * BPS, 8 + 5 * BPS, 12 + 5 * BPS)
|
| + GET_SSE( 6 * BPS, 4 + 6 * BPS, 8 + 6 * BPS, 12 + 6 * BPS)
|
| + GET_SSE( 7 * BPS, 4 + 7 * BPS, 8 + 7 * BPS, 12 + 7 * BPS)
|
|
|
| "mflo %[count] \n\t"
|
| : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
|
| [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
|
| [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [count]"=&r"(count)
|
| : [a]"r"(a), [b]"r"(b)
|
| - : "memory", "hi" , "lo"
|
| + : "memory", "hi", "lo"
|
| );
|
| return count;
|
| }
|
| @@ -713,17 +606,17 @@ static int SSE8x8(const uint8_t* a, const uint8_t* b) {
|
| __asm__ volatile(
|
| "mult $zero, $zero \n\t"
|
|
|
| - GET_SSE( 0, 4, 16, 20)
|
| - GET_SSE(32, 36, 48, 52)
|
| - GET_SSE(64, 68, 80, 84)
|
| - GET_SSE(96, 100, 112, 116)
|
| + GET_SSE(0 * BPS, 4 + 0 * BPS, 1 * BPS, 4 + 1 * BPS)
|
| + GET_SSE(2 * BPS, 4 + 2 * BPS, 3 * BPS, 4 + 3 * BPS)
|
| + GET_SSE(4 * BPS, 4 + 4 * BPS, 5 * BPS, 4 + 5 * BPS)
|
| + GET_SSE(6 * BPS, 4 + 6 * BPS, 7 * BPS, 4 + 7 * BPS)
|
|
|
| "mflo %[count] \n\t"
|
| : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
|
| [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
|
| [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [count]"=&r"(count)
|
| : [a]"r"(a), [b]"r"(b)
|
| - : "memory", "hi" , "lo"
|
| + : "memory", "hi", "lo"
|
| );
|
| return count;
|
| }
|
| @@ -735,42 +628,45 @@ static int SSE4x4(const uint8_t* a, const uint8_t* b) {
|
| __asm__ volatile(
|
| "mult $zero, $zero \n\t"
|
|
|
| - GET_SSE(0, 16, 32, 48)
|
| + GET_SSE(0 * BPS, 1 * BPS, 2 * BPS, 3 * BPS)
|
|
|
| "mflo %[count] \n\t"
|
| : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
|
| [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
|
| [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [count]"=&r"(count)
|
| : [a]"r"(a), [b]"r"(b)
|
| - : "memory", "hi" , "lo"
|
| + : "memory", "hi", "lo"
|
| );
|
| return count;
|
| }
|
|
|
| -#endif // WORK_AROUND_GCC
|
| +#undef GET_SSE
|
| +#undef GET_SSE_INNER
|
|
|
| -#undef GET_SSE_MIPS32
|
| -#undef GET_SSE_MIPS32_INNER
|
| -
|
| -#endif // WEBP_USE_MIPS32
|
| +#endif // !WORK_AROUND_GCC
|
|
|
| //------------------------------------------------------------------------------
|
| // Entry point
|
|
|
| extern void VP8EncDspInitMIPS32(void);
|
|
|
| -void VP8EncDspInitMIPS32(void) {
|
| -#if defined(WEBP_USE_MIPS32)
|
| +WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitMIPS32(void) {
|
| VP8ITransform = ITransform;
|
| + VP8FTransform = FTransform;
|
| VP8EncQuantizeBlock = QuantizeBlock;
|
| + VP8EncQuantize2Blocks = Quantize2Blocks;
|
| VP8TDisto4x4 = Disto4x4;
|
| VP8TDisto16x16 = Disto16x16;
|
| - VP8FTransform = FTransform;
|
| #if !defined(WORK_AROUND_GCC)
|
| VP8SSE16x16 = SSE16x16;
|
| VP8SSE8x8 = SSE8x8;
|
| VP8SSE16x8 = SSE16x8;
|
| VP8SSE4x4 = SSE4x4;
|
| #endif
|
| -#endif // WEBP_USE_MIPS32
|
| }
|
| +
|
| +#else // !WEBP_USE_MIPS32
|
| +
|
| +WEBP_DSP_INIT_STUB(VP8EncDspInitMIPS32)
|
| +
|
| +#endif // WEBP_USE_MIPS32
|
|
|