OLD | NEW |
| (Empty) |
1 /* | |
2 * Copyright 2014 The Android Open Source Project | |
3 * | |
4 * Use of this source code is governed by a BSD-style license that can be | |
5 * found in the LICENSE file. | |
6 */ | |
7 | |
#ifdef CRBUG_399842_FIXED

#if defined(__clang__) || (defined(__GNUC__) && !defined(SK_BUILD_FOR_MAC))

/* Keep the DWARF call-frame info in sync with a 4-byte pushl: the CFA
 * offset grows by 4 and REG is recorded at the new top of stack. */
#define CFI_PUSH(REG) \
  .cfi_adjust_cfa_offset 4; \
  .cfi_rel_offset REG, 0

/* Inverse of CFI_PUSH, emitted when the register is popped again. */
#define CFI_POP(REG) \
  .cfi_adjust_cfa_offset -4; \
  .cfi_restore REG

/* push/pop wrappers that keep unwind info correct. */
#define PUSH(REG) pushl REG; CFI_PUSH (REG)
#define POP(REG) popl REG; CFI_POP (REG)
/* Shared epilogue for every path that executed PUSH(%edi) on entry. */
#define RETURN POP(%edi); ret

/* EXTRACT_ALPHA(var1, var2)
 * In:  %var1 = four source pixels,
 *      %xmm6 = 256 (0x0100) in every 16-bit lane (set up in the prologue),
 *      %xmm5 = four destination pixels (loaded by the caller).
 * Out: %var2 = source alpha replicated into every 16-bit lane,
 *      %xmm4 = inverse alpha, i.e. 256 - alpha, per 16-bit lane,
 *      %xmm3 = copy of the destination pixels (%xmm5).
 * The two movdqa copies are unrelated to the shuffles; they are interleaved
 * here for instruction scheduling. */
#define EXTRACT_ALPHA(var1, var2) \
    movdqa      %var1, %var2;        /* Clone source pixels to extract alpha */\
    psrlw       $8, %var2;           /* Discard red and blue, leaving alpha and green */\
    pshufhw     $0xF5, %var2, %var2; /* Repeat alpha for scaling (high) */\
    movdqa      %xmm6, %xmm4; \
    pshuflw     $0xF5, %var2, %var2; /* Repeat alpha for scaling (low) */\
    movdqa      %xmm5, %xmm3; \
    psubw       %var2, %xmm4         /* Finalize alpha calculations */

/* SCALE_PIXELS
 * In:  %xmm5 = %xmm3 = destination pixels, %xmm4 = inverse-alpha lanes
 *      (both produced by EXTRACT_ALPHA).
 * Out: %xmm5 = scaled red/blue in the low byte of each 16-bit lane,
 *      %xmm3 = scaled alpha/green in the high byte of each 16-bit lane.
 * The caller recombines the two halves with pblendvb (byte mask 0x00FF
 * per lane, implicitly in %xmm0) and adds the source with paddb. */
#define SCALE_PIXELS \
    psllw       $8, %xmm5;           /* Filter out red and blue components */\
    pmulhuw     %xmm4, %xmm5;        /* Scale red and blue */\
    psrlw       $8, %xmm3;           /* Filter out alpha and green components */\
    pmullw      %xmm4, %xmm3         /* Scale alpha and green */
39 | |
/*
 * void S32A_Opaque_BlitRow32_SSE4(SkPMColor* SK_RESTRICT dst,
 *                                 const SkPMColor* SK_RESTRICT src,
 *                                 int count, U8CPU alpha)
 *
 * This function is divided into six blocks: initialization, blit 4-15 pixels,
 * blit 0-3 pixels, align destination for 16+ pixel blits,
 * blit 16+ pixels with source unaligned, blit 16+ pixels with source aligned.
 * There is some code reuse between the blocks.
 *
 * The primary optimization comes from checking the source pixels' alpha value.
 * If the alpha is zero, the pixel can be skipped entirely.
 * If the alpha is fully opaque, the pixel can be copied directly to the
 * destination.
 * According to collected statistics, these two cases are the most common.
 * The main loop(s) uses pre-loading and unrolling in an attempt to reduce the
 * worst-case memory latency.
 */
57 | |
#ifdef __clang__
    .text
#else
    .section .text.sse4.2,"ax",@progbits
    .type S32A_Opaque_BlitRow32_SSE4_asm, @function
#endif
    .p2align 4
#if defined(SK_BUILD_FOR_MAC)
    .global _S32A_Opaque_BlitRow32_SSE4_asm
    .private_extern _S32A_Opaque_BlitRow32_SSE4_asm
_S32A_Opaque_BlitRow32_SSE4_asm:
#else
    .global S32A_Opaque_BlitRow32_SSE4_asm
    .hidden S32A_Opaque_BlitRow32_SSE4_asm
S32A_Opaque_BlitRow32_SSE4_asm:
#endif
    .cfi_startproc
    //
    // ABI: i386 cdecl — arguments on the stack:
    //   4(%esp) = dst, 8(%esp) = src, 12(%esp) = count.
    //   NOTE(review): the fourth C argument (U8CPU alpha, 16(%esp)) is never
    //   read; this is the fully-opaque variant.
    //
    // Register roles for the whole function:
    //   %eax = src pointer      %edx = dst pointer
    //   %ecx = pixel counter    %edi = byte offset (saved via PUSH/POP on
    //                                  the 4+ pixel paths; callee-saved)
    //   %xmm7 = 0xFF000000 per-pixel alpha mask (ptest operand)
    //   %xmm6 = 256 in every 16-bit lane (inverse-alpha constant)
    //   %xmm0 = 0x00FF in every 16-bit lane (implicit pblendvb byte mask)
    //
    // Alpha classification idiom used throughout:
    //   ptest %xmm7, src  =>  ZF set when every alpha byte is 0,
    //                         CF set when every alpha byte is 0xFF, so
    //   jz = all transparent (skip), jc = all opaque (plain copy),
    //   ja = mixed alphas (full blend needed).
    //
    movl        8(%esp), %eax           // Source pointer
    movl        12(%esp), %ecx          // Pixel count
    movl        4(%esp), %edx           // Destination pointer
    prefetcht0  (%eax)

    // Setup SSE constants
    pcmpeqd     %xmm7, %xmm7            // 0xFF000000 mask to check alpha
    pslld       $24, %xmm7
    pcmpeqw     %xmm6, %xmm6            // 16-bit 256 to calculate inv. alpha
    psrlw       $15, %xmm6
    psllw       $8, %xmm6
    pcmpeqw     %xmm0, %xmm0            // 0x00FF00FF mask (Must be in xmm0 because of pblendvb)
    psrlw       $8, %xmm0
    subl        $4, %ecx                // Check if we have only 0-3 pixels
    js          .LReallySmall
    PUSH(%edi)                          // %edi saved from here on; RETURN restores it
    cmpl        $11, %ecx               // Do we have enough pixels to run the main loop?
    ja          .LBigBlit

    // Handle small blits (4-15 pixels)
    ////////////////////////////////////////////////////////////////////////////////
    xorl        %edi, %edi              // Reset offset to zero

.LSmallLoop:
    lddqu       (%eax, %edi), %xmm1     // Load four source pixels
    ptest       %xmm7, %xmm1            // Check if all alphas are zero or opaque
    ja          .LSmallAlphaNotOpaqueOrZero
    jz          .LSmallAlphaZero        // If all alphas are zero, skip the pixels completely
    movdqu      %xmm1, (%edx, %edi)     // All opaque: store four destination pixels
.LSmallAlphaZero:
    addl        $16, %edi
    subl        $4, %ecx                // Check if there are four additional pixels, at least
    jns         .LSmallLoop
    jmp         .LSmallRemaining

    // Handle mixed alphas (calculate and scale)
    .p2align 4
.LSmallAlphaNotOpaqueOrZero:
    lddqu       (%edx, %edi), %xmm5     // Load four destination pixels
    EXTRACT_ALPHA(xmm1, xmm2)           // Extract and clone alpha value
    SCALE_PIXELS                        // Scale pixels using alpha

    addl        $16, %edi
    subl        $4, %ecx                // Check if there are four additional pixels, at least
    pblendvb    %xmm5, %xmm3            // Mask in %xmm0, implicitly
    paddb       %xmm3, %xmm1            // Add source and destination pixels together
    movdqu      %xmm1, -16(%edx, %edi)  // Store four destination pixels
    jns         .LSmallLoop

    // Handle the last 0-3 pixels (also used by the main loops)
    // On entry %ecx is in [-4, -1]; -4 means nothing left.  The last four
    // pixels are reloaded OVERLAPPING the already-written ones, and pblendw
    // merges only the genuinely-new pixels so nothing is double-blended.
.LSmallRemaining:
    cmpl        $-4, %ecx               // Check if we are done
    je          .LSmallExit
    sall        $2, %ecx                // Calculate byte offset for last pixels (count * 4)
    addl        %ecx, %edi

    lddqu       (%eax, %edi), %xmm1     // Load last four source pixels (overlapping)
    ptest       %xmm7, %xmm1            // Check if all alphas are zero or opaque
    jc          .LSmallRemainingStoreAll// If all alphas are opaque, just store (overlapping)
    jz          .LSmallExit             // If all alphas are zero, skip the pixels completely

    // Handle mixed alphas (calculate and scale).  This is SCALE_PIXELS done
    // by hand into %xmm3/%xmm2 so %xmm5 keeps the untouched destination
    // pixels for the partial pblendw merges below.
    lddqu       (%edx, %edi), %xmm5     // Load last four destination pixels (overlapping)
    EXTRACT_ALPHA(xmm1, xmm2)           // Extract and clone alpha value

    psllw       $8, %xmm3               // Filter out red and blue components
    pmulhuw     %xmm4, %xmm3            // Scale red and blue
    movdqa      %xmm5, %xmm2
    psrlw       $8, %xmm2               // Filter out alpha and green components
    pmullw      %xmm4, %xmm2            // Scale alpha and green

    cmpl        $-8, %ecx               // Check how many pixels should be written
    pblendvb    %xmm3, %xmm2            // Combine results (mask in %xmm0, implicitly)
    paddb       %xmm2, %xmm1            // Add source and destination pixels together
    jb          .LSmallPixelsLeft1
    ja          .LSmallPixelsLeft3      // To avoid double-blending the overlapping pixels...
    pblendw     $0xF0, %xmm1, %xmm5     // Merge only the final two pixels to the destination
    movdqu      %xmm5, (%edx, %edi)     // Store last two destination pixels
.LSmallExit:
    RETURN

.LSmallPixelsLeft1:
    pblendw     $0xC0, %xmm1, %xmm5     // Merge only the final pixel to the destination
    movdqu      %xmm5, (%edx, %edi)     // Store last destination pixel
    RETURN

.LSmallPixelsLeft3:
    pblendw     $0xFC, %xmm1, %xmm5     // Merge only the final three pixels to the destination
    movdqu      %xmm5, (%edx, %edi)     // Store last three destination pixels
    RETURN

.LSmallRemainingStoreAll:
    movdqu      %xmm1, (%edx, %edi)     // Store last destination pixels (overwrite)
    RETURN

    // Handle really small blits (0-3 pixels).  %edi was never pushed on this
    // path, so it exits with a plain ret rather than RETURN.
    ////////////////////////////////////////////////////////////////////////////////
.LReallySmall:
    addl        $4, %ecx                // Undo the earlier bias; %ecx = real count (0-3)
    jle         .LReallySmallExit
    pcmpeqd     %xmm1, %xmm1            // Fill unused lanes with opaque pixels
    cmp         $2, %ecx                // Check how many pixels should be read
    pinsrd      $0x0, (%eax), %xmm1     // Load one source pixel
    pinsrd      $0x0, (%edx), %xmm5     // Load one destination pixel
    jb          .LReallySmallCalc
    pinsrd      $0x1, 4(%eax), %xmm1    // Load second source pixel
    pinsrd      $0x1, 4(%edx), %xmm5    // Load second destination pixel
    je          .LReallySmallCalc
    pinsrd      $0x2, 8(%eax), %xmm1    // Load third source pixel
    pinsrd      $0x2, 8(%edx), %xmm5    // Load third destination pixel

.LReallySmallCalc:
    ptest       %xmm7, %xmm1            // Check if all alphas are opaque
    jc          .LReallySmallStore      // If all alphas are opaque, just store

    // Handle mixed alphas (calculate and scale).  Variant of SCALE_PIXELS
    // using pand + pmullw + psrlw for the red/blue half.
    EXTRACT_ALPHA(xmm1, xmm2)           // Extract and clone alpha value

    pand        %xmm0, %xmm5            // Filter out red and blue components
    pmullw      %xmm4, %xmm5            // Scale red and blue
    psrlw       $8, %xmm3               // Filter out alpha and green components
    pmullw      %xmm4, %xmm3            // Scale alpha and green

    psrlw       $8, %xmm5               // Combine results
    pblendvb    %xmm5, %xmm3            // Mask in %xmm0, implicitly
    paddb       %xmm3, %xmm1            // Add source and destination pixels together

.LReallySmallStore:
    cmp         $2, %ecx                // Check how many pixels should be written
    pextrd      $0x0, %xmm1, (%edx)     // Store one destination pixel
    jb          .LReallySmallExit
    pextrd      $0x1, %xmm1, 4(%edx)    // Store second destination pixel
    je          .LReallySmallExit
    pextrd      $0x2, %xmm1, 8(%edx)    // Store third destination pixel
.LReallySmallExit:
    ret

    // Handle bigger blit operations (16+ pixels)
    ////////////////////////////////////////////////////////////////////////////////
    .p2align 4
.LBigBlit:
    // Align destination?
    testl       $0xF, %edx
    lddqu       (%eax), %xmm1           // Pre-load four source pixels
    jz          .LAligned

    movl        %edx, %edi              // Calculate alignment of destination pointer
    negl        %edi
    andl        $0xF, %edi              // %edi = bytes (4/8/12) to the next 16-byte boundary

    // Handle 1-3 pixels to align destination
    ptest       %xmm7, %xmm1            // Check if all alphas are zero or opaque
    jz          .LAlignDone             // If all alphas are zero, just skip
    lddqu       (%edx), %xmm5           // Load four destination pixels
    jc          .LAlignStore            // If all alphas are opaque, just store

    // Handle mixed alphas (calculate and scale) — same hand-expanded
    // SCALE_PIXELS as in .LSmallRemaining, keeping %xmm5 intact for pblendw.
    EXTRACT_ALPHA(xmm1, xmm2)           // Extract and clone alpha value

    psllw       $8, %xmm3               // Filter out red and blue components
    pmulhuw     %xmm4, %xmm3            // Scale red and blue
    movdqa      %xmm5, %xmm2
    psrlw       $8, %xmm2               // Filter out alpha and green components
    pmullw      %xmm4, %xmm2            // Scale alpha and green

    pblendvb    %xmm3, %xmm2            // Combine results (mask in %xmm0, implicitly)
    paddb       %xmm2, %xmm1            // Add source and destination pixels together

.LAlignStore:
    cmp         $8, %edi                // Check how many pixels should be written
    jb          .LAlignPixelsLeft1
    ja          .LAlignPixelsLeft3
    pblendw     $0x0F, %xmm1, %xmm5     // Blend two pixels
    jmp         .LAlignStorePixels

.LAlignPixelsLeft1:
    pblendw     $0x03, %xmm1, %xmm5     // Blend one pixel
    jmp         .LAlignStorePixels

.LAlignPixelsLeft3:
    pblendw     $0x3F, %xmm1, %xmm5     // Blend three pixels

.LAlignStorePixels:
    movdqu      %xmm5, (%edx)           // Store destination pixels

.LAlignDone:
    addl        %edi, %eax              // Adjust pointers and pixel count
    addl        %edi, %edx
    shrl        $2, %edi                // Bytes -> pixels consumed by alignment
    lddqu       (%eax), %xmm1           // Pre-load new source pixels (after alignment)
    subl        %edi, %ecx

.LAligned:                              // Destination is guaranteed to be 16 byte aligned
    xorl        %edi, %edi              // Reset offset to zero
    subl        $8, %ecx                // Decrease counter (Reserve four pixels for the cleanup)
    testl       $0xF, %eax              // Check alignment of source pointer
    jz          .LAlignedLoop

    // Source not aligned to destination
    ////////////////////////////////////////////////////////////////////////////////
    .p2align 4
.LUnalignedLoop:                        // Main loop for unaligned, handles eight pixels per iteration
    ptest       %xmm7, %xmm1            // Check if all alphas are zero or opaque
    ja          .LAlphaNotOpaqueOrZero00
    lddqu       16(%eax, %edi), %xmm2   // Pre-load four source pixels (doesn't disturb flags)
    jz          .LAlphaZero00
    movdqa      %xmm1, (%edx, %edi)     // Store four destination pixels

.LAlphaZero00:
    ptest       %xmm7, %xmm2            // Check if all alphas are zero or opaque
    ja          .LAlphaNotOpaqueOrZero01
    lddqu       32(%eax, %edi), %xmm1   // Pre-load four source pixels
    jz          .LAlphaZero01
    movdqa      %xmm2, 16(%edx, %edi)   // Store four destination pixels

.LAlphaZero01:
    addl        $32, %edi               // Adjust offset and pixel count
    subl        $8, %ecx
    jae         .LUnalignedLoop
    addl        $8, %ecx                // Adjust pixel count
    jmp         .LLoopCleanup0

    .p2align 4
.LAlphaNotOpaqueOrZero00:
    movdqa      (%edx, %edi), %xmm5     // Load four destination pixels
    EXTRACT_ALPHA(xmm1, xmm2)           // Extract and clone alpha value
    SCALE_PIXELS                        // Scale pixels using alpha

    lddqu       16(%eax, %edi), %xmm2   // Pre-load four source pixels
    pblendvb    %xmm5, %xmm3            // Combine results (mask in %xmm0, implicitly)
    paddb       %xmm3, %xmm1            // Add source and destination pixels together
    movdqa      %xmm1, (%edx, %edi)     // Store four destination pixels

    // Handle next four pixels
    ptest       %xmm7, %xmm2            // Check if all alphas are zero or opaque
    ja          .LAlphaNotOpaqueOrZero01
    lddqu       32(%eax, %edi), %xmm1   // Pre-load four source pixels
    jz          .LAlphaZero02
    movdqa      %xmm2, 16(%edx, %edi)   // Store four destination pixels
.LAlphaZero02:
    addl        $32, %edi               // Adjust offset and pixel count
    subl        $8, %ecx
    jae         .LUnalignedLoop
    addl        $8, %ecx                // Adjust pixel count
    jmp         .LLoopCleanup0

    .p2align 4
.LAlphaNotOpaqueOrZero01:               // Second unrolled half: pixels in %xmm2, scratch is %xmm1
    movdqa      16(%edx, %edi), %xmm5   // Load four destination pixels
    EXTRACT_ALPHA(xmm2, xmm1)           // Extract and clone alpha value
    SCALE_PIXELS                        // Scale pixels using alpha

    lddqu       32(%eax, %edi), %xmm1   // Pre-load four source pixels
    addl        $32, %edi
    pblendvb    %xmm5, %xmm3            // Combine results (mask in %xmm0, implicitly)
    paddb       %xmm3, %xmm2            // Add source and destination pixels together
    subl        $8, %ecx
    movdqa      %xmm2, -16(%edx, %edi)  // Store four destination pixels
    jae         .LUnalignedLoop
    addl        $8, %ecx                // Adjust pixel count

    // Cleanup - handle pending pixels from loop
.LLoopCleanup0:
    ptest       %xmm7, %xmm1            // Check if all alphas are zero or opaque
    ja          .LAlphaNotOpaqueOrZero02
    jz          .LAlphaZero03
    movdqa      %xmm1, (%edx, %edi)     // Store four destination pixels
.LAlphaZero03:
    addl        $16, %edi
    subl        $4, %ecx
    js          .LSmallRemaining        // Reuse code from small loop

.LRemain0:
    lddqu       (%eax, %edi), %xmm1     // Load four source pixels
    ptest       %xmm7, %xmm1            // Check if all alphas are zero or opaque
    ja          .LAlphaNotOpaqueOrZero02
    jz          .LAlphaZero04
    movdqa      %xmm1, (%edx, %edi)     // Store four destination pixels
.LAlphaZero04:
    addl        $16, %edi
    subl        $4, %ecx
    jmp         .LSmallRemaining        // Reuse code from small loop

.LAlphaNotOpaqueOrZero02:
    movdqa      (%edx, %edi), %xmm5     // Load four destination pixels
    EXTRACT_ALPHA(xmm1, xmm2)           // Extract and clone alpha value
    SCALE_PIXELS                        // Scale pixels using alpha

    addl        $16, %edi
    subl        $4, %ecx
    pblendvb    %xmm5, %xmm3            // Combine results (mask in %xmm0, implicitly)
    paddb       %xmm3, %xmm1            // Add source and destination pixels together
    movdqa      %xmm1, -16(%edx, %edi)  // Store four destination pixels
    js          .LSmallRemaining        // Reuse code from small loop
    jmp         .LRemain0

    // Source aligned to destination — identical structure to the unaligned
    // loop, but source loads use movdqa instead of lddqu.
    ////////////////////////////////////////////////////////////////////////////////
    .p2align 4
.LAlignedLoop:                          // Main loop for aligned, handles eight pixels per iteration
    ptest       %xmm7, %xmm1            // Check if all alphas are zero or opaque
    ja          .LAlphaNotOpaqueOrZero10
    movdqa      16(%eax, %edi), %xmm2   // Pre-load four source pixels
    jz          .LAlphaZero10
    movdqa      %xmm1, (%edx, %edi)     // Store four destination pixels

.LAlphaZero10:
    ptest       %xmm7, %xmm2            // Check if all alphas are zero or opaque
    ja          .LAlphaNotOpaqueOrZero11
    movdqa      32(%eax, %edi), %xmm1   // Pre-load four source pixels
    jz          .LAlphaZero11
    movdqa      %xmm2, 16(%edx, %edi)   // Store four destination pixels

.LAlphaZero11:
    addl        $32, %edi               // Adjust offset and pixel count
    subl        $8, %ecx
    jae         .LAlignedLoop
    addl        $8, %ecx                // Adjust pixel count
    jmp         .LLoopCleanup1

    .p2align 4
.LAlphaNotOpaqueOrZero10:
    movdqa      (%edx, %edi), %xmm5     // Load four destination pixels
    EXTRACT_ALPHA(xmm1, xmm2)           // Extract and clone alpha value
    SCALE_PIXELS                        // Scale pixels using alpha

    movdqa      16(%eax, %edi), %xmm2   // Pre-load four source pixels
    pblendvb    %xmm5, %xmm3            // Combine results (mask in %xmm0, implicitly)
    paddb       %xmm3, %xmm1            // Add source and destination pixels together
    movdqa      %xmm1, (%edx, %edi)     // Store four destination pixels

    // Handle next four pixels
    ptest       %xmm7, %xmm2            // Check if all alphas are zero or opaque
    ja          .LAlphaNotOpaqueOrZero11
    movdqa      32(%eax, %edi), %xmm1   // Pre-load four source pixels
    jz          .LAlphaZero12
    movdqa      %xmm2, 16(%edx, %edi)   // Store four destination pixels
.LAlphaZero12:
    addl        $32, %edi               // Adjust offset and pixel count
    subl        $8, %ecx
    jae         .LAlignedLoop
    addl        $8, %ecx                // Adjust pixel count
    jmp         .LLoopCleanup1

    .p2align 4
.LAlphaNotOpaqueOrZero11:               // Second unrolled half: pixels in %xmm2, scratch is %xmm1
    movdqa      16(%edx, %edi), %xmm5   // Load four destination pixels
    EXTRACT_ALPHA(xmm2, xmm1)           // Extract and clone alpha value
    SCALE_PIXELS                        // Scale pixels using alpha
    movdqa      32(%eax, %edi), %xmm1   // Pre-load four source pixels

    addl        $32, %edi
    pblendvb    %xmm5, %xmm3            // Combine results (mask in %xmm0, implicitly)
    paddb       %xmm3, %xmm2            // Add source and destination pixels together
    subl        $8, %ecx
    movdqa      %xmm2, -16(%edx, %edi)  // Store four destination pixels
    jae         .LAlignedLoop
    addl        $8, %ecx                // Adjust pixel count

    // Cleanup - handle pending pixels from loop
.LLoopCleanup1:
    ptest       %xmm7, %xmm1            // Check if all alphas are zero or opaque
    ja          .LAlphaNotOpaqueOrZero12
    jz          .LAlphaZero13
    movdqa      %xmm1, (%edx, %edi)     // Store four destination pixels
.LAlphaZero13:
    addl        $16, %edi
    subl        $4, %ecx
    js          .LSmallRemaining        // Reuse code from small loop

.LRemain1:
    movdqa      (%eax, %edi), %xmm1     // Load four source pixels
    ptest       %xmm7, %xmm1            // Check if all alphas are zero or opaque
    ja          .LAlphaNotOpaqueOrZero12
    jz          .LAlphaZero14
    movdqa      %xmm1, (%edx, %edi)     // Store four destination pixels
.LAlphaZero14:
    addl        $16, %edi
    subl        $4, %ecx
    jmp         .LSmallRemaining        // Reuse code from small loop

.LAlphaNotOpaqueOrZero12:
    movdqa      (%edx, %edi), %xmm5     // Load four destination pixels
    EXTRACT_ALPHA(xmm1, xmm2)           // Extract and clone alpha value
    SCALE_PIXELS                        // Scale pixels using alpha

    addl        $16, %edi
    subl        $4, %ecx
    pblendvb    %xmm5, %xmm3            // Combine results (mask in %xmm0, implicitly)
    paddb       %xmm3, %xmm1            // Add source and destination pixels together
    movdqa      %xmm1, -16(%edx, %edi)  // Store four destination pixels
    js          .LSmallRemaining        // Reuse code from small loop
    jmp         .LRemain1

    .cfi_endproc
#ifndef __clang__
    .size S32A_Opaque_BlitRow32_SSE4_asm, .-S32A_Opaque_BlitRow32_SSE4_asm
#endif
#endif

#endif // CRBUG_399842_FIXED
OLD | NEW |