/*
 * Copyright 2014 The Android Open Source Project
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

// Everything up to the matching #endif at the end of the file is compiled out
// unless CRBUG_399842_FIXED is defined.
#ifdef CRBUG_399842_FIXED

// Assemble only with clang, or with GCC when not building for Mac.
#if defined(__clang__) || (defined(__GNUC__) && !defined(SK_BUILD_FOR_MAC))

/*
 * EXTRACT_ALPHA(var1, var2)
 *
 * var1 - register holding four source pixels (read only)
 * var2 - scratch register; on exit every 16-bit lane holds the alpha of the
 *        pixel that lane belongs to
 *
 * Also reads %xmm5 and %xmm6 and writes:
 *   %xmm3 = copy of %xmm5
 *   %xmm4 = %xmm6 - alpha, per 16-bit lane
 * The two plain register copies are interleaved with the shuffles.
 * Does not modify the integer flags (SSE instructions only).
 */
#define EXTRACT_ALPHA(var1, var2) \
    movdqa      %var1, %var2;           /* Clone source pixels to extract alpha */ \
    psrlw       $8, %var2;              /* Discard red and blue, leaving alpha and green */ \
    pshufhw     $0xF5, %var2, %var2;    /* Repeat alpha for scaling (high) */ \
    movdqa      %xmm6, %xmm4;           /* Start from the constant held in xmm6 */ \
    pshuflw     $0xF5, %var2, %var2;    /* Repeat alpha for scaling (low) */ \
    movdqa      %xmm5, %xmm3;           /* Keep a second copy of xmm5 in xmm3 */ \
    psubw       %var2, %xmm4            /* Finalize alpha calculations */

/*
 * SCALE_PIXELS
 *
 * Multiplies the pixels in %xmm5 (and their copy in %xmm3) by the per-lane
 * 16-bit scale in %xmm4.
 *
 * On exit:
 *   %xmm5 - scaled red/blue, in the low byte of each 16-bit lane
 *           (pmulhuw keeps the high half of the product)
 *   %xmm3 - scaled alpha/green, in the high byte of each 16-bit lane
 *           (pmullw keeps the low half of the product)
 * Callers merge the two with pblendvb and the 0x00FF00FF mask in %xmm0.
 * Does not modify the integer flags (SSE instructions only).
 */
#define SCALE_PIXELS \
    psllw       $8, %xmm5;              /* Filter out red and blue components */ \
    pmulhuw     %xmm4, %xmm5;           /* Scale red and blue */ \
    psrlw       $8, %xmm3;              /* Filter out alpha and green components */ \
    pmullw      %xmm4, %xmm3            /* Scale alpha and green */


/*
 * void S32A_Opaque_BlitRow32_SSE4(SkPMColor* SK_RESTRICT dst,
 *                                 const SkPMColor* SK_RESTRICT src,
 *                                 int count, U8CPU alpha)
 *
 * The routine below is exported under the symbol
 * S32A_Opaque_BlitRow32_SSE4_asm (with a leading underscore on Mac).
 * It reads its arguments from %rdi (dst), %rsi (src) and %edx (count), which
 * matches the System V AMD64 argument order. The fourth argument (alpha) is
 * never read: %ecx is overwritten with the pixel count on entry.
 *
 * This function is divided into six blocks: initialization, blit 4-15 pixels,
 * blit 0-3 pixels, align destination for 16+ pixel blits,
 * blit 16+ pixels with source unaligned, blit 16+ pixels with source aligned.
 * There is some code reuse between the blocks.
 *
 * The primary optimization comes from checking the source pixels' alpha value.
 * If the alpha is zero, the pixel can be skipped entirely.
 * If the alpha is fully opaque, the pixel can be copied directly to the
 * destination.
 * According to collected statistics, these two cases are the most common.
 * The main loop(s) uses pre-loading and unrolling in an attempt to reduce the
 * memory latency worst-case.
 *
 * Register roles after initialization:
 *   %rax        - source pointer
 *   %rdx        - destination pointer
 *   %ecx        - pixel counter, biased (see the comments in each block)
 *   %rdi        - byte offset applied to both pointers (temporarily the
 *                 alignment byte count in .LBigBlit)
 *   %xmm0       - 0x00FF00FF merge mask, the implicit pblendvb operand
 *   %xmm6       - eight 16-bit 256 constants, used to form 256 - alpha
 *   %xmm7       - 0xFF000000 alpha test mask
 *   %xmm1/%xmm2 - source pixels (alternating as pre-load buffers)
 *   %xmm5       - destination pixels
 *   %xmm3/%xmm4 - scratch written by EXTRACT_ALPHA / SCALE_PIXELS
 *
 * Alpha classification uses "ptest %xmm7, <src>": ZF is set when every alpha
 * byte is zero, CF is set when every alpha byte is 0xFF. Hence
 *   ja - mixed alphas (neither flag set), jz - all zero, jc - all opaque.
 * The SSE instructions used here leave the integer flags untouched, which is
 * why several branches are separated from the cmp/sub/ptest that feeds them.
 *
 * No stack memory is used and no other function is called.
 */

#ifdef __clang__
    .text
#else
    .section .text.sse4.2,"ax",@progbits
    .type S32A_Opaque_BlitRow32_SSE4_asm, @function
#endif
    .p2align 4
#if defined(SK_BUILD_FOR_MAC)
    .global _S32A_Opaque_BlitRow32_SSE4_asm
    .private_extern _S32A_Opaque_BlitRow32_SSE4_asm
_S32A_Opaque_BlitRow32_SSE4_asm:
#else
    .global S32A_Opaque_BlitRow32_SSE4_asm
    .hidden S32A_Opaque_BlitRow32_SSE4_asm
S32A_Opaque_BlitRow32_SSE4_asm:
#endif
    .cfi_startproc
    prefetcht0  (%rsi)
    movl        %edx, %ecx              // Pixel count
    movq        %rdi, %rdx              // Destination pointer
    movq        %rsi, %rax              // Source pointer

    // Setup SSE constants
    movdqa      .LAlphaCheckMask(%rip), %xmm7   // 0xFF000000 mask to check alpha
    movdqa      .LInverseAlphaCalc(%rip), %xmm6 // 16-bit 256 to calculate inv. alpha
    movdqa      .LResultMergeMask(%rip), %xmm0  // 0x00FF00FF mask (Must be in xmm0 because of pblendvb)

    // From here on %ecx = (pixels remaining) - 4.
    subl        $4, %ecx                // Check if we have only 0-3 pixels
    js          .LReallySmall
    cmpl        $11, %ecx               // Do we have enough pixels to run the main loop?
    ja          .LBigBlit

    // Handle small blits (4-15 pixels)
    ////////////////////////////////////////////////////////////////////////////////
    xorq        %rdi, %rdi              // Reset offset to zero

.LSmallLoop:
    lddqu       (%rax, %rdi), %xmm1     // Load four source pixels
    ptest       %xmm7, %xmm1            // Check if all alphas are zero or opaque
    ja          .LSmallAlphaNotOpaqueOrZero
    jz          .LSmallAlphaZero
    movdqu      %xmm1, (%rdx, %rdi)     // Store four destination pixels
.LSmallAlphaZero:
    addq        $16, %rdi
    subl        $4, %ecx                // Check if there are four additional pixels, at least
    jns         .LSmallLoop
    jmp         .LSmallRemaining

    // Handle mixed alphas (calculate and scale)
    .p2align 4
.LSmallAlphaNotOpaqueOrZero:
    lddqu       (%rdx, %rdi), %xmm5     // Load four destination pixels
    EXTRACT_ALPHA(xmm1, xmm2)           // Extract and clone alpha value
    SCALE_PIXELS                        // Scale pixels using alpha

    addq        $16, %rdi
    subl        $4, %ecx                // Check if there are four additional pixels, at least
    pblendvb    %xmm5, %xmm3            // Mask in %xmm0, implicitly
    paddb       %xmm3, %xmm1            // Add source and destination pixels together
    movdqu      %xmm1, -16(%rdx, %rdi)  // Store four destination pixels
    jns         .LSmallLoop             // Flags still come from the subl above

    // Handle the last 0-3 pixels (also used by the main loops)
    // On entry %ecx = (pixels remaining) - 4, i.e. -4..-1, and %rdi is the
    // byte offset just past the last pixel already handled.
.LSmallRemaining:
    cmpl        $-4, %ecx               // Check if we are done
    je          .LSmallExit
    sall        $2, %ecx                // Calculate offset for last pixels
    movslq      %ecx, %rcx
    addq        %rcx, %rdi              // Step back so the 16-byte window ends on the last pixel

    lddqu       (%rax, %rdi), %xmm1     // Load last four source pixels (overlapping)
    ptest       %xmm7, %xmm1            // Check if all alphas are zero or opaque
    jc          .LSmallRemainingStoreAll // If all alphas are opaque, just store (overlapping)
    jz          .LSmallExit             // If all alphas are zero, skip the pixels completely

    // Handle mixed alphas (calculate and scale)
    // %xmm5 is kept unmodified here so pblendw can merge into the original
    // destination pixels; the scaling is done on the copies in %xmm3/%xmm2.
    lddqu       (%rdx, %rdi), %xmm5     // Load last four destination pixels (overlapping)
    EXTRACT_ALPHA(xmm1, xmm2)           // Extract and clone alpha value

    psllw       $8, %xmm3               // Filter out red and blue components
    pmulhuw     %xmm4, %xmm3            // Scale red and blue
    movdqa      %xmm5, %xmm2
    psrlw       $8, %xmm2               // Filter out alpha and green components
    pmullw      %xmm4, %xmm2            // Scale alpha and green

    // %ecx was multiplied by 4 above: -12, -8, -4 mean 1, 2, 3 pixels left.
    cmpl        $-8, %ecx               // Check how many pixels should be written
    pblendvb    %xmm3, %xmm2            // Combine results (mask in %xmm0, implicitly)
    paddb       %xmm2, %xmm1            // Add source and destination pixels together
    jb          .LSmallPixelsLeft1
    ja          .LSmallPixelsLeft3      // To avoid double-blending the overlapping pixels...
    pblendw     $0xF0, %xmm1, %xmm5     // Merge only the final two pixels to the destination
    movdqu      %xmm5, (%rdx, %rdi)     // Store last two destination pixels
.LSmallExit:
    ret

.LSmallPixelsLeft1:
    pblendw     $0xC0, %xmm1, %xmm5     // Merge only the final pixel to the destination
    movdqu      %xmm5, (%rdx, %rdi)     // Store last destination pixel
    ret

.LSmallPixelsLeft3:
    pblendw     $0xFC, %xmm1, %xmm5     // Merge only the final three pixels to the destination
    movdqu      %xmm5, (%rdx, %rdi)     // Store last three destination pixels
    ret

.LSmallRemainingStoreAll:
    movdqu      %xmm1, (%rdx, %rdi)     // Store last destination pixels (overwrite)
    ret

    // Handle really small blits (0-3 pixels)
    ////////////////////////////////////////////////////////////////////////////////
.LReallySmall:
    addl        $4, %ecx                // Undo the bias: %ecx = pixel count
    jle         .LReallySmallExit       // Nothing to do for count <= 0
    pcmpeqd     %xmm1, %xmm1            // All ones, so lanes that are not loaded test as opaque
    cmpl        $2, %ecx                // Check how many pixels should be read
    pinsrd      $0x0, (%rax), %xmm1     // Load one source pixel
    pinsrd      $0x0, (%rdx), %xmm5     // Load one destination pixel
    jb          .LReallySmallCalc
    pinsrd      $0x1, 4(%rax), %xmm1    // Load second source pixel
    pinsrd      $0x1, 4(%rdx), %xmm5    // Load second destination pixel
    je          .LReallySmallCalc
    pinsrd      $0x2, 8(%rax), %xmm1    // Load third source pixel
    pinsrd      $0x2, 8(%rdx), %xmm5    // Load third destination pixel

.LReallySmallCalc:
    ptest       %xmm7, %xmm1            // Check if all alphas are opaque
    jc          .LReallySmallStore      // If all alphas are opaque, just store

    // Handle mixed alphas (calculate and scale)
    EXTRACT_ALPHA(xmm1, xmm2)           // Extract and clone alpha value

    pand        %xmm0, %xmm5            // Filter out red and blue components
    pmullw      %xmm4, %xmm5            // Scale red and blue
    psrlw       $8, %xmm3               // Filter out alpha and green components
    pmullw      %xmm4, %xmm3            // Scale alpha and green

    psrlw       $8, %xmm5               // Combine results
    pblendvb    %xmm5, %xmm3            // Mask in %xmm0, implicitly
    paddb       %xmm3, %xmm1            // Add source and destination pixels together

.LReallySmallStore:
    cmpl        $2, %ecx                // Check how many pixels should be written
    pextrd      $0x0, %xmm1, (%rdx)     // Store one destination pixel
    jb          .LReallySmallExit
    pextrd      $0x1, %xmm1, 4(%rdx)    // Store second destination pixel
    je          .LReallySmallExit
    pextrd      $0x2, %xmm1, 8(%rdx)    // Store third destination pixel
.LReallySmallExit:
    ret

    // Handle bigger blit operations (16+ pixels)
    ////////////////////////////////////////////////////////////////////////////////
    .p2align 4
.LBigBlit:
    // Align destination?
    testl       $0xF, %edx
    lddqu       (%rax), %xmm1           // Pre-load four source pixels
    jz          .LAligned               // Flags still come from the testl above

    movq        %rdx, %rdi              // Calculate alignment of destination pointer
    negq        %rdi
    andl        $0xF, %edi              // %rdi = bytes up to the next 16-byte boundary

    // Handle 1-3 pixels to align destination
    ptest       %xmm7, %xmm1            // Check if all alphas are zero or opaque
    jz          .LAlignDone             // If all alphas are zero, just skip
    lddqu       (%rdx), %xmm5           // Load four destination pixels
    jc          .LAlignStore            // If all alphas are opaque, just store

    // Handle mixed alphas (calculate and scale)
    EXTRACT_ALPHA(xmm1, xmm2)           // Extract and clone alpha value

    psllw       $8, %xmm3               // Filter out red and blue components
    pmulhuw     %xmm4, %xmm3            // Scale red and blue
    movdqa      %xmm5, %xmm2
    psrlw       $8, %xmm2               // Filter out alpha and green components
    pmullw      %xmm4, %xmm2            // Scale alpha and green

    pblendvb    %xmm3, %xmm2            // Combine results (mask in %xmm0, implicitly)
    paddb       %xmm2, %xmm1            // Add source and destination pixels together

.LAlignStore:
    cmpl        $8, %edi                // Check how many pixels should be written
    jb          .LAlignPixelsLeft1
    ja          .LAlignPixelsLeft3
    pblendw     $0x0F, %xmm1, %xmm5     // Blend two pixels
    jmp         .LAlignStorePixels

.LAlignPixelsLeft1:
    pblendw     $0x03, %xmm1, %xmm5     // Blend one pixel
    jmp         .LAlignStorePixels

.LAlignPixelsLeft3:
    pblendw     $0x3F, %xmm1, %xmm5     // Blend three pixels

.LAlignStorePixels:
    movdqu      %xmm5, (%rdx)           // Store destination pixels

.LAlignDone:
    addq        %rdi, %rax              // Adjust pointers and pixel count
    addq        %rdi, %rdx
    shrq        $2, %rdi                // Bytes -> pixels
    lddqu       (%rax), %xmm1           // Pre-load new source pixels (after alignment)
    subl        %edi, %ecx

.LAligned:                              // Destination is guaranteed to be 16 byte aligned
    xorq        %rdi, %rdi              // Reset offset to zero
    // After this %ecx = (pixels remaining at offset %rdi) - 12, which keeps the
    // 32(%rax, %rdi) pre-loads in the main loops inside the source row.
    subl        $8, %ecx                // Decrease counter (Reserve four pixels for the cleanup)
    testl       $0xF, %eax              // Check alignment of source pointer
    jz          .LAlignedLoop

    // Source not aligned to destination
    ////////////////////////////////////////////////////////////////////////////////
    .p2align 4
.LUnalignedLoop:                        // Main loop for unaligned, handles eight pixels per iteration
    ptest       %xmm7, %xmm1            // Check if all alphas are zero or opaque
    ja          .LAlphaNotOpaqueOrZero00
    lddqu       16(%rax, %rdi), %xmm2   // Pre-load four source pixels
    jz          .LAlphaZero00
    movdqa      %xmm1, (%rdx, %rdi)     // Store four destination pixels

.LAlphaZero00:
    ptest       %xmm7, %xmm2            // Check if all alphas are zero or opaque
    ja          .LAlphaNotOpaqueOrZero01
    lddqu       32(%rax, %rdi), %xmm1   // Pre-load four source pixels
    jz          .LAlphaZero01
    movdqa      %xmm2, 16(%rdx, %rdi)   // Store four destination pixels

.LAlphaZero01:
    addq        $32, %rdi               // Adjust offset and pixel count
    subl        $8, %ecx
    jae         .LUnalignedLoop         // No borrow: at least eight more pixels fit the loop
    addl        $8, %ecx                // Adjust pixel count
    jmp         .LLoopCleanup0

    .p2align 4
.LAlphaNotOpaqueOrZero00:
    movdqa      (%rdx, %rdi), %xmm5     // Load four destination pixels
    EXTRACT_ALPHA(xmm1, xmm2)           // Extract and clone alpha value
    SCALE_PIXELS                        // Scale pixels using alpha

    lddqu       16(%rax, %rdi), %xmm2   // Pre-load four source pixels
    pblendvb    %xmm5, %xmm3            // Combine results (mask in %xmm0, implicitly)
    paddb       %xmm3, %xmm1            // Add source and destination pixels together
    movdqa      %xmm1, (%rdx, %rdi)     // Store four destination pixels

    // Handle next four pixels
    ptest       %xmm7, %xmm2            // Check if all alphas are zero or opaque
    ja          .LAlphaNotOpaqueOrZero01
    lddqu       32(%rax, %rdi), %xmm1   // Pre-load four source pixels
    jz          .LAlphaZero02
    movdqa      %xmm2, 16(%rdx, %rdi)   // Store four destination pixels
.LAlphaZero02:
    addq        $32, %rdi               // Adjust offset and pixel count
    subl        $8, %ecx
    jae         .LUnalignedLoop
    addl        $8, %ecx                // Adjust pixel count
    jmp         .LLoopCleanup0

    .p2align 4
.LAlphaNotOpaqueOrZero01:
    movdqa      16(%rdx, %rdi), %xmm5   // Load four destination pixels
    EXTRACT_ALPHA(xmm2, xmm1)           // Extract and clone alpha value
    SCALE_PIXELS                        // Scale pixels using alpha

    lddqu       32(%rax, %rdi), %xmm1   // Pre-load four source pixels
    addq        $32, %rdi
    pblendvb    %xmm5, %xmm3            // Combine results (mask in %xmm0, implicitly)
    paddb       %xmm3, %xmm2            // Add source and destination pixels together
    subl        $8, %ecx
    movdqa      %xmm2, -16(%rdx, %rdi)  // Store four destination pixels
    jae         .LUnalignedLoop         // Flags still come from the subl above
    addl        $8, %ecx                // Adjust pixel count

    // Cleanup - handle pending pixels from loop
    // Here %xmm1 holds four pre-loaded source pixels for offset %rdi and
    // %ecx = (pixels remaining at offset %rdi) - 4.
.LLoopCleanup0:
    ptest       %xmm7, %xmm1            // Check if all alphas are zero or opaque
    ja          .LAlphaNotOpaqueOrZero02
    jz          .LAlphaZero03
    movdqa      %xmm1, (%rdx, %rdi)     // Store four destination pixels
.LAlphaZero03:
    addq        $16, %rdi
    subl        $4, %ecx
    js          .LSmallRemaining        // Reuse code from small loop

.LRemain0:
    lddqu       (%rax, %rdi), %xmm1     // Load four source pixels
    ptest       %xmm7, %xmm1            // Check if all alphas are zero or opaque
    ja          .LAlphaNotOpaqueOrZero02
    jz          .LAlphaZero04
    movdqa      %xmm1, (%rdx, %rdi)     // Store four destination pixels
.LAlphaZero04:
    addq        $16, %rdi
    subl        $4, %ecx
    jmp         .LSmallRemaining        // Reuse code from small loop

.LAlphaNotOpaqueOrZero02:
    movdqa      (%rdx, %rdi), %xmm5     // Load four destination pixels
    EXTRACT_ALPHA(xmm1, xmm2)           // Extract and clone alpha value
    SCALE_PIXELS                        // Scale pixels using alpha

    addq        $16, %rdi
    subl        $4, %ecx
    pblendvb    %xmm5, %xmm3            // Combine results (mask in %xmm0, implicitly)
    paddb       %xmm3, %xmm1            // Add source and destination pixels together
    movdqa      %xmm1, -16(%rdx, %rdi)  // Store four destination pixels
    js          .LSmallRemaining        // Reuse code from small loop
    jmp         .LRemain0

    // Source aligned to destination
    ////////////////////////////////////////////////////////////////////////////////
    .p2align 4
.LAlignedLoop:                          // Main loop for aligned, handles eight pixels per iteration
    ptest       %xmm7, %xmm1            // Check if all alphas are zero or opaque
    ja          .LAlphaNotOpaqueOrZero10
    movdqa      16(%rax, %rdi), %xmm2   // Pre-load four source pixels
    jz          .LAlphaZero10
    movdqa      %xmm1, (%rdx, %rdi)     // Store four destination pixels

.LAlphaZero10:
    ptest       %xmm7, %xmm2            // Check if all alphas are zero or opaque
    ja          .LAlphaNotOpaqueOrZero11
    movdqa      32(%rax, %rdi), %xmm1   // Pre-load four source pixels
    jz          .LAlphaZero11
    movdqa      %xmm2, 16(%rdx, %rdi)   // Store four destination pixels

.LAlphaZero11:
    addq        $32, %rdi               // Adjust offset and pixel count
    subl        $8, %ecx
    jae         .LAlignedLoop           // No borrow: at least eight more pixels fit the loop
    addl        $8, %ecx                // Adjust pixel count
    jmp         .LLoopCleanup1

    .p2align 4
.LAlphaNotOpaqueOrZero10:
    movdqa      (%rdx, %rdi), %xmm5     // Load four destination pixels
    EXTRACT_ALPHA(xmm1, xmm2)           // Extract and clone alpha value
    SCALE_PIXELS                        // Scale pixels using alpha

    movdqa      16(%rax, %rdi), %xmm2   // Pre-load four source pixels
    pblendvb    %xmm5, %xmm3            // Combine results (mask in %xmm0, implicitly)
    paddb       %xmm3, %xmm1            // Add source and destination pixels together
    movdqa      %xmm1, (%rdx, %rdi)     // Store four destination pixels

    // Handle next four pixels
    ptest       %xmm7, %xmm2            // Check if all alphas are zero or opaque
    ja          .LAlphaNotOpaqueOrZero11
    movdqa      32(%rax, %rdi), %xmm1   // Pre-load four source pixels
    jz          .LAlphaZero12
    movdqa      %xmm2, 16(%rdx, %rdi)   // Store four destination pixels
.LAlphaZero12:
    addq        $32, %rdi               // Adjust offset and pixel count
    subl        $8, %ecx
    jae         .LAlignedLoop
    addl        $8, %ecx                // Adjust pixel count
    jmp         .LLoopCleanup1

    .p2align 4
.LAlphaNotOpaqueOrZero11:
    movdqa      16(%rdx, %rdi), %xmm5   // Load four destination pixels
    EXTRACT_ALPHA(xmm2, xmm1)           // Extract and clone alpha value
    SCALE_PIXELS                        // Scale pixels using alpha
    movdqa      32(%rax, %rdi), %xmm1   // Pre-load four source pixels

    addq        $32, %rdi
    pblendvb    %xmm5, %xmm3            // Combine results (mask in %xmm0, implicitly)
    paddb       %xmm3, %xmm2            // Add source and destination pixels together
    subl        $8, %ecx
    movdqa      %xmm2, -16(%rdx, %rdi)  // Store four destination pixels
    jae         .LAlignedLoop           // Flags still come from the subl above
    addl        $8, %ecx                // Adjust pixel count

    // Cleanup - handle four pending pixels from loop
    // Here %xmm1 holds four pre-loaded source pixels for offset %rdi and
    // %ecx = (pixels remaining at offset %rdi) - 4.
.LLoopCleanup1:
    ptest       %xmm7, %xmm1            // Check if all alphas are zero or opaque
    ja          .LAlphaNotOpaqueOrZero12
    jz          .LAlphaZero13
    movdqa      %xmm1, (%rdx, %rdi)     // Store four destination pixels
.LAlphaZero13:
    addq        $16, %rdi
    subl        $4, %ecx
    js          .LSmallRemaining        // Reuse code from small loop

.LRemain1:
    movdqa      (%rax, %rdi), %xmm1     // Pre-load four source pixels
    ptest       %xmm7, %xmm1            // Check if all alphas are zero or opaque
    ja          .LAlphaNotOpaqueOrZero12
    jz          .LAlphaZero14
    movdqa      %xmm1, (%rdx, %rdi)     // Store four destination pixels
.LAlphaZero14:
    addq        $16, %rdi
    subl        $4, %ecx
    jmp         .LSmallRemaining        // Reuse code from small loop

.LAlphaNotOpaqueOrZero12:
    movdqa      (%rdx, %rdi), %xmm5     // Load four destination pixels
    EXTRACT_ALPHA(xmm1, xmm2)           // Extract and clone alpha value
    SCALE_PIXELS                        // Scale pixels using alpha

    addq        $16, %rdi
    subl        $4, %ecx
    pblendvb    %xmm5, %xmm3            // Combine results (mask in %xmm0, implicitly)
    paddb       %xmm3, %xmm1            // Add source and destination pixels together
    movdqa      %xmm1, -16(%rdx, %rdi)  // Store four destination pixels
    js          .LSmallRemaining        // Reuse code from small loop
    jmp         .LRemain1

    .cfi_endproc
#ifndef __clang__
    .size S32A_Opaque_BlitRow32_SSE4_asm, .-S32A_Opaque_BlitRow32_SSE4_asm
#endif

// Constants for SSE code
// Each constant is 16 bytes and the group starts on a 16-byte boundary, so
// every label stays 16-byte aligned as required by the movdqa loads.
#ifndef __clang__
    .section .rodata
#endif
    .p2align 4
.LAlphaCheckMask:                       // Selects the alpha byte of each 32-bit pixel
    .long   0xFF000000, 0xFF000000, 0xFF000000, 0xFF000000
.LInverseAlphaCalc:                     // 256 per 16-bit lane; alpha is subtracted from it
    .word   256, 256, 256, 256, 256, 256, 256, 256
.LResultMergeMask:                      // Selects the low byte of each 16-bit lane for pblendvb
    .long   0x00FF00FF, 0x00FF00FF, 0x00FF00FF, 0x00FF00FF
#endif // defined(__clang__) || (defined(__GNUC__) && !defined(SK_BUILD_FOR_MAC))

#endif // CRBUG_399842_FIXED