src/opts/SkBitmapFilter_opts_SSE2.cpp - Issue 759603002: Optimize highQualityFilter

Side by Side Diff: src/opts/SkBitmapFilter_opts_SSE2.cpp

Issue 759603002: Optimize highQualityFilter (Closed) Base URL: https://skia.googlesource.com/skia.git@master

Patch Set: add const Created 6 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
1 /*	1 /*

2 * Copyright 2013 Google Inc.	2 * Copyright 2013 Google Inc.

3 *	3 *

4 * Use of this source code is governed by a BSD-style license that can be	4 * Use of this source code is governed by a BSD-style license that can be

5 * found in the LICENSE file.	5 * found in the LICENSE file.

6 */	6 */

7	7

8 #include <emmintrin.h>	8 #include <emmintrin.h>

9 #include "SkBitmap.h"	9 #include "SkBitmap.h"

10 #include "SkBitmapFilter_opts_SSE2.h"	10 #include "SkBitmapFilter_opts_SSE2.h"

(...skipping 31 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
42	42

43 // because the border is handled specially, this is guaranteed to have all 16 pixels	43 // because the border is handled specially, this is guaranteed to have all 16 pixels

44 // available to it without running off the bitmap's edge.	44 // available to it without running off the bitmap's edge.

45	45

46 void highQualityFilter_SSE2(const SkBitmapProcState& s, int x, int y,	46 void highQualityFilter_SSE2(const SkBitmapProcState& s, int x, int y,

47 SkPMColor* SK_RESTRICT colors, int count) {	47 SkPMColor* SK_RESTRICT colors, int count) {

48	48

49 const int maxX = s.fBitmap->width();	49 const int maxX = s.fBitmap->width();

50 const int maxY = s.fBitmap->height();	50 const int maxY = s.fBitmap->height();

51 SkAutoTMalloc<SkScalar> xWeights(maxX);	51 SkAutoTMalloc<SkScalar> xWeights(maxX);

	52 const SkBitmapFilter* filter = s.getBitmapFilter();

52	53

53 while (count-- > 0) {	54 while (count-- > 0) {

54 SkPoint srcPt;	55 SkPoint srcPt;

55 s.fInvProc(s.fInvMatrix, x + 0.5f, y + 0.5f, &srcPt);	56 s.fInvProc(s.fInvMatrix, x + 0.5f, y + 0.5f, &srcPt);

56 srcPt.fX -= SK_ScalarHalf;	57 srcPt.fX -= SK_ScalarHalf;

57 srcPt.fY -= SK_ScalarHalf;	58 srcPt.fY -= SK_ScalarHalf;

58	59

59 __m128 weight = _mm_setzero_ps();	60 __m128 weight = _mm_setzero_ps();

60 __m128 accum = _mm_setzero_ps();	61 __m128 accum = _mm_setzero_ps();

61	62

62 int y0 = SkClampMax(SkScalarCeilToInt(srcPt.fY-s.getBitmapFilter()->width()), maxY);	63 int y0 = SkClampMax(SkScalarCeilToInt(srcPt.fY - filter->width()), maxY);

63 int y1 = SkClampMax(SkScalarFloorToInt(srcPt.fY+s.getBitmapFilter()->width()+1), maxY);	64 int y1 = SkClampMax(SkScalarFloorToInt(srcPt.fY + filter->width() + 1), maxY);

64 int x0 = SkClampMax(SkScalarCeilToInt(srcPt.fX-s.getBitmapFilter()->width()), maxX);	65 int x0 = SkClampMax(SkScalarCeilToInt(srcPt.fX - filter->width()), maxX);

65 int x1 = SkClampMax(SkScalarFloorToInt(srcPt.fX+s.getBitmapFilter()->width())+1, maxX);	66 int x1 = SkClampMax(SkScalarFloorToInt(srcPt.fX + filter->width()) + 1, maxX);

66	67

67 for (int srcX = x0; srcX < x1 ; srcX++) {	68 for (int srcX = x0; srcX < x1 ; srcX++) {

68 // Looking these up once instead of each loop is a ~15% speedup.	69 // Looking these up once instead of each loop is a ~15% speedup.

69 xWeights[srcX - x0] = s.getBitmapFilter()->lookupScalar((srcPt.fX - srcX));	70 xWeights[srcX - x0] = filter->lookupScalar((srcPt.fX - srcX));

70 }	71 }

71	72

72 for (int srcY = y0; srcY < y1; srcY++) {	73 for (int srcY = y0; srcY < y1; srcY++) {

73 SkScalar yWeight = s.getBitmapFilter()->lookupScalar((srcPt.fY - srcY));	74 SkScalar yWeight = filter->lookupScalar((srcPt.fY - srcY));

74	75

75 for (int srcX = x0; srcX < x1 ; srcX++) {	76 for (int srcX = x0; srcX < x1 ; srcX++) {

76 SkScalar xWeight = xWeights[srcX - x0];	77 SkScalar xWeight = xWeights[srcX - x0];

77	78

78 SkScalar combined_weight = SkScalarMul(xWeight, yWeight);	79 SkScalar combined_weight = SkScalarMul(xWeight, yWeight);

	80 __m128 weightVector = _mm_set1_ps(combined_weight);

	81 weight = _mm_add_ps( weight, weightVector );

79	82

80 SkPMColor color = *s.fBitmap->getAddr32(srcX, srcY);	83 SkPMColor color = *s.fBitmap->getAddr32(srcX, srcY);

	84 if (!color) {

	85 continue;

	86 }

81	87

82 __m128i c = _mm_cvtsi32_si128(color);	88 __m128i c = _mm_cvtsi32_si128(color);

83 c = _mm_unpacklo_epi8(c, _mm_setzero_si128());	89 c = _mm_unpacklo_epi8(c, _mm_setzero_si128());

84 c = _mm_unpacklo_epi16(c, _mm_setzero_si128());	90 c = _mm_unpacklo_epi16(c, _mm_setzero_si128());

85 __m128 cfloat = _mm_cvtepi32_ps(c);	91 __m128 cfloat = _mm_cvtepi32_ps(c);

86	92

87 __m128 weightVector = _mm_set1_ps(combined_weight);

88 accum = _mm_add_ps(accum, _mm_mul_ps(cfloat, weightVector));	93 accum = _mm_add_ps(accum, _mm_mul_ps(cfloat, weightVector));

89 weight = _mm_add_ps( weight, weightVector );

90 }	94 }

91 }	95 }

92	96

93 accum = _mm_div_ps(accum, weight);	97 accum = _mm_div_ps(accum, weight);

94 accum = _mm_add_ps(accum, _mm_set1_ps(0.5f));	98 accum = _mm_add_ps(accum, _mm_set1_ps(0.5f));

95 __m128i accumInt = _mm_cvttps_epi32(accum);	99 __m128i accumInt = _mm_cvttps_epi32(accum);

96 accumInt = _mm_packs_epi32(accumInt, _mm_setzero_si128());	100 accumInt = _mm_packs_epi32(accumInt, _mm_setzero_si128());

97 accumInt = _mm_packus_epi16(accumInt, _mm_setzero_si128());	101 accumInt = _mm_packus_epi16(accumInt, _mm_setzero_si128());

98 SkPMColor c = _mm_cvtsi128_si32(accumInt);	102 SkPMColor c = _mm_cvtsi128_si32(accumInt);

99	103

(...skipping 453 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
553 void applySIMDPadding_SSE2(SkConvolutionFilter1D *filter) {	557 void applySIMDPadding_SSE2(SkConvolutionFilter1D *filter) {

554 // Padding |paddingCount| of more dummy coefficients after the coefficients	558 // Padding |paddingCount| of more dummy coefficients after the coefficients

555 // of last filter to prevent SIMD instructions which load 8 or 16 bytes	559 // of last filter to prevent SIMD instructions which load 8 or 16 bytes

556 // together to access invalid memory areas. We are not trying to align the	560 // together to access invalid memory areas. We are not trying to align the

557 // coefficients right now due to the opaqueness of <vector> implementation.	561 // coefficients right now due to the opaqueness of <vector> implementation.

558 // This has to be done after all |AddFilter| calls.	562 // This has to be done after all |AddFilter| calls.

559 for (int i = 0; i < 8; ++i) {	563 for (int i = 0; i < 8; ++i) {

560 filter->addFilterValue(static_cast<SkConvolutionFilter1D::ConvolutionFixed>(0));	564 filter->addFilterValue(static_cast<SkConvolutionFilter1D::ConvolutionFixed>(0));

561 }	565 }

562 }	566 }

OLD	NEW

« src/core/SkBitmapFilter.cpp ('K') | « src/core/SkBitmapFilter.cpp ('k') | no next file » | no next file with comments »