src/opts/SkBitmapFilter_opts_SSE2.cpp - Issue 525283002: Enable highQualityFilter_SSE2

Side by Side Diff: src/opts/SkBitmapFilter_opts_SSE2.cpp

Issue 525283002: Enable highQualityFilter_SSE2 (Closed) Base URL: https://skia.googlesource.com/skia.git@master

Patch Set: fix mac gm test Created 6 years, 3 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
1 /*	1 /*

2 * Copyright 2013 Google Inc.	2 * Copyright 2013 Google Inc.

3 *	3 *

4 * Use of this source code is governed by a BSD-style license that can be	4 * Use of this source code is governed by a BSD-style license that can be

5 * found in the LICENSE file.	5 * found in the LICENSE file.

6 */	6 */

7	7

8 #include <emmintrin.h>	8 #include <emmintrin.h>

9 #include "SkBitmap.h"	9 #include "SkBitmap.h"

10 #include "SkBitmapFilter_opts_SSE2.h"	10 #include "SkBitmapFilter_opts_SSE2.h"

(...skipping 28 matching lines...) Expand all Loading...
39 printf("%3.4f %3.4f %3.4f %3.4f\n", f[0], f[1], f[2], f[3]);	39 printf("%3.4f %3.4f %3.4f %3.4f\n", f[0], f[1], f[2], f[3]);

40 }	40 }

41 #endif	41 #endif

42	42

43 // because the border is handled specially, this is guaranteed to have all 16 pixels	43 // because the border is handled specially, this is guaranteed to have all 16 pixels

44 // available to it without running off the bitmap's edge.	44 // available to it without running off the bitmap's edge.

45	45

46 void highQualityFilter_SSE2(const SkBitmapProcState& s, int x, int y,	46 void highQualityFilter_SSE2(const SkBitmapProcState& s, int x, int y,

47 SkPMColor* SK_RESTRICT colors, int count) {	47 SkPMColor* SK_RESTRICT colors, int count) {

48	48

49 const int maxX = s.fBitmap->width() - 1;	49 const int maxX = s.fBitmap->width();

50 const int maxY = s.fBitmap->height() - 1;	50 const int maxY = s.fBitmap->height();

	51 SkAutoTMalloc<SkScalar> xWeights(maxX);

51	52

52 while (count-- > 0) {	53 while (count-- > 0) {

53 SkPoint srcPt;	54 SkPoint srcPt;

54 s.fInvProc(s.fInvMatrix, SkIntToScalar(x),	55 s.fInvProc(s.fInvMatrix, x + 0.5f, y + 0.5f, &srcPt);

55 SkIntToScalar(y), &srcPt);

56 srcPt.fX -= SK_ScalarHalf;	56 srcPt.fX -= SK_ScalarHalf;

57 srcPt.fY -= SK_ScalarHalf;	57 srcPt.fY -= SK_ScalarHalf;

58	58

59 int sx = SkScalarFloorToInt(srcPt.fX);

60 int sy = SkScalarFloorToInt(srcPt.fY);

61

62 __m128 weight = _mm_setzero_ps();	59 __m128 weight = _mm_setzero_ps();

63 __m128 accum = _mm_setzero_ps();	60 __m128 accum = _mm_setzero_ps();

64	61

65 int y0 = SkTMax(0, int(ceil(sy-s.getBitmapFilter()->width() + 0.5f)));	62 int y0 = SkClampMax(SkScalarCeilToInt(srcPt.fY-s.getBitmapFilter()->width()), maxY);

66 int y1 = SkTMin(maxY, int(floor(sy+s.getBitmapFilter()->width() + 0.5f)));	63 int y1 = SkClampMax(SkScalarFloorToInt(srcPt.fY+s.getBitmapFilter()->width()+1), maxY);

67 int x0 = SkTMax(0, int(ceil(sx-s.getBitmapFilter()->width() + 0.5f)));	64 int x0 = SkClampMax(SkScalarCeilToInt(srcPt.fX-s.getBitmapFilter()->width()), maxX);

68 int x1 = SkTMin(maxX, int(floor(sx+s.getBitmapFilter()->width() + 0.5f)));	65 int x1 = SkClampMax(SkScalarFloorToInt(srcPt.fX+s.getBitmapFilter()->width())+1, maxX);

69	66

70 for (int src_y = y0; src_y <= y1; src_y++) {	67 for (int srcX = x0; srcX < x1 ; srcX++) {

71 float yweight = SkScalarToFloat(s.getBitmapFilter()->lookupScalar(srcPt.fY - src_y));	68 // Looking these up once instead of each loop is a ~15% speedup.

	69 xWeights[srcX - x0] = s.getBitmapFilter()->lookupScalar((srcPt.fX - srcX));

	70 }

72	71

73 for (int src_x = x0; src_x <= x1 ; src_x++) {	72 for (int srcY = y0; srcY < y1; srcY++) {

74 float xweight = SkScalarToFloat(s.getBitmapFilter()->lookupScalar(srcPt.fX - src_x));	73 SkScalar yWeight = s.getBitmapFilter()->lookupScalar((srcPt.fY - srcY));

75	74

76 float combined_weight = xweight * yweight;	75 for (int srcX = x0; srcX < x1 ; srcX++) {

	76 SkScalar xWeight = xWeights[srcX - x0];

77	77

78 SkPMColor color = *s.fBitmap->getAddr32(src_x, src_y);	78 SkScalar combined_weight = SkScalarMul(xWeight, yWeight);

79	79

80 __m128i c = _mm_cvtsi32_si128( color );	80 SkPMColor color = *s.fBitmap->getAddr32(srcX, srcY);

	81

	82 __m128i c = _mm_cvtsi32_si128(color);

81 c = _mm_unpacklo_epi8(c, _mm_setzero_si128());	83 c = _mm_unpacklo_epi8(c, _mm_setzero_si128());

82 c = _mm_unpacklo_epi16(c, _mm_setzero_si128());	84 c = _mm_unpacklo_epi16(c, _mm_setzero_si128());

83	85 __m128 cfloat = _mm_cvtepi32_ps(c);

84 __m128 cfloat = _mm_cvtepi32_ps( c );

85	86

86 __m128 weightVector = _mm_set1_ps(combined_weight);	87 __m128 weightVector = _mm_set1_ps(combined_weight);

87

88 accum = _mm_add_ps(accum, _mm_mul_ps(cfloat, weightVector));	88 accum = _mm_add_ps(accum, _mm_mul_ps(cfloat, weightVector));

89 weight = _mm_add_ps( weight, weightVector );	89 weight = _mm_add_ps( weight, weightVector );

90 }	90 }

91 }	91 }

92	92

93 accum = _mm_div_ps(accum, weight);	93 accum = _mm_div_ps(accum, weight);

94 accum = _mm_add_ps(accum, _mm_set1_ps(0.5f));	94 accum = _mm_add_ps(accum, _mm_set1_ps(0.5f));

	95 __m128i accumInt = _mm_cvttps_epi32(accum);

	96 accumInt = _mm_packs_epi32(accumInt, _mm_setzero_si128());

	97 accumInt = _mm_packus_epi16(accumInt, _mm_setzero_si128());

	98 SkPMColor c = _mm_cvtsi128_si32(accumInt);

95	99

96 __m128i accumInt = _mm_cvtps_epi32( accum );	100 int a = SkClampMax(SkGetPackedA32(c), 255);

97	101 int r = SkClampMax(SkGetPackedR32(c), a);

98 int localResult[4];	102 int g = SkClampMax(SkGetPackedG32(c), a);

99 _mm_storeu_si128((__m128i *) (localResult), accumInt);	103 int b = SkClampMax(SkGetPackedB32(c), a);

100 int a = SkClampMax(localResult[0], 255);

101 int r = SkClampMax(localResult[1], a);

102 int g = SkClampMax(localResult[2], a);

103 int b = SkClampMax(localResult[3], a);

104	104

105 *colors++ = SkPackARGB32(a, r, g, b);	105 *colors++ = SkPackARGB32(a, r, g, b);

106	106

107 x++;	107 x++;

108 }	108 }

109 }	109 }

110	110

111 // Convolves horizontally along a single row. The row data is given in	111 // Convolves horizontally along a single row. The row data is given in

112 // |src_data| and continues for the num_values() of the filter.	112 // |src_data| and continues for the num_values() of the filter.

113 void convolveHorizontally_SSE2(const unsigned char* src_data,	113 void convolveHorizontally_SSE2(const unsigned char* src_data,

(...skipping 439 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
553 void applySIMDPadding_SSE2(SkConvolutionFilter1D *filter) {	553 void applySIMDPadding_SSE2(SkConvolutionFilter1D *filter) {

554 // Padding |paddingCount| of more dummy coefficients after the coefficients	554 // Padding |paddingCount| of more dummy coefficients after the coefficients

555 // of last filter to prevent SIMD instructions which load 8 or 16 bytes	555 // of last filter to prevent SIMD instructions which load 8 or 16 bytes

556 // together to access invalid memory areas. We are not trying to align the	556 // together to access invalid memory areas. We are not trying to align the

557 // coefficients right now due to the opaqueness of <vector> implementation.	557 // coefficients right now due to the opaqueness of <vector> implementation.

558 // This has to be done after all |AddFilter| calls.	558 // This has to be done after all |AddFilter| calls.

559 for (int i = 0; i < 8; ++i) {	559 for (int i = 0; i < 8; ++i) {

560 filter->addFilterValue(static_cast<SkConvolutionFilter1D::ConvolutionFixed>(0));	560 filter->addFilterValue(static_cast<SkConvolutionFilter1D::ConvolutionFixed>(0));

561 }	561 }

562 }	562 }

OLD	NEW

« no previous file with comments | « no previous file | src/opts/opts_check_x86.cpp » ('j') | no next file with comments »