OLD | NEW |
| (Empty) |
1 /* | |
2 * Copyright 2013 The Android Open Source Project | |
3 * | |
4 * Use of this source code is governed by a BSD-style license that can be | |
5 * found in the LICENSE file. | |
6 */ | |
7 | |
8 #if !defined(_MSC_VER) | |
9 | |
/*
 * Stack / unwind-info helpers (32-bit x86, one stack slot = 4 bytes).
 *
 * CFI_PUSH(REG) and CFI_POP(REG) describe a pushl / popl of REG to the
 * unwinder: the CFA offset moves by one slot and REG is recorded as saved at
 * the top of the stack (push) or as holding its entry value again (pop).
 */
#define CFI_PUSH(REG) \
    .cfi_adjust_cfa_offset 4; \
    .cfi_rel_offset REG, 0

#define CFI_POP(REG) \
    .cfi_adjust_cfa_offset -4; \
    .cfi_restore REG

/* Push / pop REG and keep the unwind info in sync. */
#define PUSH(REG)   pushl REG; CFI_PUSH (REG)
#define POP(REG)    popl REG; CFI_POP (REG)

/*
 * Restore %edi and return to the caller.
 *
 * CFI directives describe the code in textual order, not in execution order.
 * RETURN is used in the middle of the function, and the instructions that
 * follow each use are reached by jumps from code where %edi is still pushed.
 * The unwind state is therefore saved before the pop and reinstated after the
 * ret, so the code following a RETURN is described with %edi still on the
 * stack instead of inheriting the "already popped" state.
 */
#define RETURN \
    .cfi_remember_state; \
    POP(%edi); \
    ret; \
    .cfi_restore_state
21 | |
/*
 * EXTRACT_ALPHA(SRC, TMP)
 *
 * Derives the per-pixel destination scale factor from four source pixels.
 *
 *   In:       %SRC  = four 32-bit source pixels (alpha in the top byte)
 *             %xmm5 = four destination pixels
 *             %xmm6 = 256 in every 16-bit lane
 *   Out:      %xmm4 = 256 - alpha, in both 16-bit lanes of each pixel
 *             %xmm3 = copy of %xmm5
 *   Clobbers: %TMP (left holding the replicated alpha words)
 *
 * The two register copies are interleaved with the shuffles, exactly as in
 * the instruction order below; none of the instructions touches EFLAGS.
 */
#define EXTRACT_ALPHA(SRC, TMP) \
    movdqa  %SRC, %TMP;             /* Work on a copy of the source pixels */ \
    psrlw   $8, %TMP;               /* Keep the high byte of each word: green (low word), alpha (high word) */ \
    pshufhw $0xF5, %TMP, %TMP;      /* Upper two pixels: copy the alpha word over the green word */ \
    movdqa  %xmm6, %xmm4;           /* Start from 256 in every word */ \
    pshuflw $0xF5, %TMP, %TMP;      /* Lower two pixels: copy the alpha word over the green word */ \
    movdqa  %xmm5, %xmm3;           /* Second copy of the destination pixels */ \
    psubw   %TMP, %xmm4             /* Scale factor = 256 - alpha */
30 | |
/*
 * SCALE_PIXELS
 *
 * Multiplies the destination pixels by the scale factor produced by
 * EXTRACT_ALPHA, handling the two bytes of every 16-bit word separately.
 *
 *   In:   %xmm5 = %xmm3 = four destination pixels
 *         %xmm4 = scale factor (256 - alpha) in every 16-bit lane
 *   Out:  %xmm5 = (low byte of each word * scale) >> 8, in the low byte
 *                 (red and blue components)
 *         %xmm3 = high byte of each word * scale, as a 16-bit product whose
 *                 high byte is the scaled value (alpha and green components)
 *
 * Callers merge the two halves with pblendvb and the 0x00FF00FF mask in %xmm0.
 */
#define SCALE_PIXELS \
    psllw   $8, %xmm5;              /* Move red/blue into the high byte, dropping alpha/green */ \
    pmulhuw %xmm4, %xmm5;           /* High half of the product = (red/blue * scale) >> 8 */ \
    psrlw   $8, %xmm3;              /* Move alpha/green into the low byte, dropping red/blue */ \
    pmullw  %xmm4, %xmm3            /* alpha/green * scale; the result sits in the high byte */
36 | |
37 | |
/*
 * void S32A_Opaque_BlitRow32_SSE4_asm(SkPMColor* SK_RESTRICT dst,
 *                                     const SkPMColor* SK_RESTRICT src,
 *                                     int count, U8CPU alpha)
 *
 * This function is divided into six blocks: initialization, blit 4-15 pixels,
 * blit 0-3 pixels, align destination for 16+ pixel blits,
 * blit 16+ pixels with source unaligned, blit 16+ pixels with source aligned.
 * There is some code reuse between the blocks.
 *
 * The primary optimization comes from checking the source pixels' alpha value.
 * If the alpha is zero, the pixel can be skipped entirely.
 * If the alpha is fully opaque, the pixel can be copied directly to the
 * destination.
 * According to collected statistics, these two cases are the most common.
 * The main loops use pre-loading and unrolling in an attempt to reduce the
 * worst-case memory latency.
 */
55 | |
#ifdef __clang__
    .text
    .global _S32A_Opaque_BlitRow32_SSE4_asm
#else
    .section .text.sse4.2,"ax",@progbits
    .type S32A_Opaque_BlitRow32_SSE4_asm, @function
    .global S32A_Opaque_BlitRow32_SSE4_asm
#endif

    // Entry point. Both spellings of the symbol (with and without a leading
    // underscore) label the same address.
    //
    // Register roles for the whole function:
    //   %eax  = source pointer          (second stack argument)
    //   %ecx  = pixel counter           (third stack argument, biased below)
    //   %edx  = destination pointer     (first stack argument)
    //   %edi  = byte offset / scratch   (saved with PUSH before first use)
    //   %xmm0 = 0x00FF00FF in every dword (byte selector for pblendvb)
    //   %xmm6 = 256 in every 16-bit lane  (used to form 256 - alpha)
    //   %xmm7 = 0xFF000000 in every dword (alpha mask for ptest)
    // The fourth argument (alpha) is never read.
    .p2align 4
_S32A_Opaque_BlitRow32_SSE4_asm:
S32A_Opaque_BlitRow32_SSE4_asm:
    .cfi_startproc
    movl    8(%esp), %eax           // src
    movl    12(%esp), %ecx          // count
    movl    4(%esp), %edx           // dst
    prefetcht0 (%eax)               // Start fetching the first source pixels

    // Build the SSE constants without touching memory
    pcmpeqd %xmm7, %xmm7            // All ones ...
    pslld   $24, %xmm7              // ... shifted to 0xFF000000 per pixel
    pcmpeqw %xmm6, %xmm6            // All ones ...
    psrlw   $15, %xmm6              // ... reduced to 1 per word ...
    psllw   $8, %xmm6               // ... then scaled to 256 per word
    pcmpeqw %xmm0, %xmm0            // All ones ...
    psrlw   $8, %xmm0               // ... reduced to 0x00FF per word (pblendvb reads %xmm0)

    // Pick the code path from the pixel count
    subl    $4, %ecx                // %ecx = count - 4
    js      .LReallySmall           // Negative: fewer than four pixels
    PUSH(%edi)                      // Every remaining path uses %edi
    cmpl    $11, %ecx               // count - 4 > 11 means 16 or more pixels
    ja      .LBigBlit
87 | |
// ---------------------------------------------------------------------------
// Small blits (4-15 pixels)
//
// %edi is the byte offset into both buffers; %ecx holds (pixels left - 4), so
// it stays non-negative while at least four more pixels can be processed.
// All loads and stores are unaligned.
// ---------------------------------------------------------------------------
    xorl    %edi, %edi              // offset = 0

.LSmallLoop:
    lddqu   (%eax, %edi), %xmm1     // Four source pixels
    ptest   %xmm7, %xmm1            // ZF: every alpha is 0, CF: every alpha is 0xFF
    ja      .LSmallAlphaNotOpaqueOrZero     // Neither flag set: mixed alphas
    jz      .LSmallAlphaZero        // Fully transparent: leave the destination alone
    movdqu  %xmm1, (%edx, %edi)     // Fully opaque: plain copy
.LSmallAlphaZero:
    addl    $16, %edi
    subl    $4, %ecx                // Another full group of four available?
    jns     .LSmallLoop
    jmp     .LSmallRemaining

    // Mixed alphas: dst = src + dst * (256 - alpha) / 256
    .p2align 4
.LSmallAlphaNotOpaqueOrZero:
    lddqu   (%edx, %edi), %xmm5     // Four destination pixels
    EXTRACT_ALPHA(xmm1, xmm2)       // %xmm4 = 256 - alpha, %xmm3 = copy of dst
    SCALE_PIXELS                    // Scale both halves of the destination

    addl    $16, %edi
    subl    $4, %ecx                // Sets the flags consumed by jns below
    pblendvb %xmm0, %xmm5, %xmm3    // Merge scaled red/blue with scaled alpha/green
    paddb   %xmm3, %xmm1            // src + scaled dst
    movdqu  %xmm1, -16(%edx, %edi)  // Offset was already advanced, hence -16
    jns     .LSmallLoop

    // Last 0-3 pixels (the big unaligned loop also jumps here).
    // On entry %ecx = (pixels left - 4), a value in the range -4..-1.
.LSmallRemaining:
    cmpl    $-4, %ecx               // -4 means nothing is left
    je      .LSmallExit
    sall    $2, %ecx                // Pixels to bytes (still negative)
    addl    %ecx, %edi              // Step back so the final four pixels end at the row end

    lddqu   (%eax, %edi), %xmm1     // Last four source pixels, overlapping finished ones
    ptest   %xmm7, %xmm1
    jc      .LSmallRemainingStoreAll        // All opaque: storing all four is harmless
    jz      .LSmallExit             // All transparent: nothing to do

    // Mixed alphas. %xmm5 is kept intact as the original destination so the
    // already blended overlapping pixels can be preserved below.
    lddqu   (%edx, %edi), %xmm5     // Last four destination pixels (overlapping)
    EXTRACT_ALPHA(xmm1, xmm2)       // %xmm4 = 256 - alpha, %xmm3 = copy of dst

    psllw   $8, %xmm3               // Isolate red/blue in the copy
    pmulhuw %xmm4, %xmm3            // Scale red/blue
    movdqa  %xmm5, %xmm2
    psrlw   $8, %xmm2               // Isolate alpha/green
    pmullw  %xmm4, %xmm2            // Scale alpha/green

    cmpl    $-8, %ecx               // %ecx is -12, -8 or -4 bytes: 1, 2 or 3 pixels left
    pblendvb %xmm0, %xmm3, %xmm2    // Merge the two scaled halves
    paddb   %xmm2, %xmm1            // src + scaled dst
    jb      .LSmallPixelsLeft1
    ja      .LSmallPixelsLeft3
    pblendw $0xF0, %xmm1, %xmm5     // Two left: take only the upper two pixels
    movdqu  %xmm5, (%edx, %edi)
.LSmallExit:
    RETURN

.LSmallPixelsLeft1:
    pblendw $0xC0, %xmm1, %xmm5     // One left: take only the top pixel
    movdqu  %xmm5, (%edx, %edi)
    RETURN

.LSmallPixelsLeft3:
    pblendw $0xFC, %xmm1, %xmm5     // Three left: take the upper three pixels
    movdqu  %xmm5, (%edx, %edi)
    RETURN

.LSmallRemainingStoreAll:
    movdqu  %xmm1, (%edx, %edi)     // Opaque source: overwrite all four pixels
    RETURN
163 | |
// ---------------------------------------------------------------------------
// Really small blits (0-3 pixels)
//
// Reached straight from the entry code, before %edi is pushed, so this path
// uses only %eax, %ecx, %edx and SSE registers and returns with a plain ret.
// Pixels are moved one dword at a time, so nothing outside the row is read or
// written.
//
// This block sits textually after code that runs with %edi pushed, so the
// unwind state is set explicitly on entry (return address at the top of the
// stack, %edi untouched) and the pushed state is re-established after the ret
// for the code that follows.
// ---------------------------------------------------------------------------
.LReallySmall:
    .cfi_def_cfa_offset 4           // Nothing has been pushed on this path
    .cfi_restore %edi               // %edi still holds the caller's value
    addl    $4, %ecx                // Undo the bias: %ecx = count
    jle     .LReallySmallExit       // Zero (or negative) count: nothing to do
    pcmpeqd %xmm1, %xmm1            // Unused source lanes read as opaque (all ones)
    cmpl    $2, %ecx                // Flags stay valid across the pinsrd loads below
    pinsrd  $0x0, (%eax), %xmm1     // First source pixel
    pinsrd  $0x0, (%edx), %xmm5     // First destination pixel
    jb      .LReallySmallCalc       // count == 1
    pinsrd  $0x1, 4(%eax), %xmm1    // Second source pixel
    pinsrd  $0x1, 4(%edx), %xmm5    // Second destination pixel
    je      .LReallySmallCalc       // count == 2
    pinsrd  $0x2, 8(%eax), %xmm1    // Third source pixel
    pinsrd  $0x2, 8(%edx), %xmm5    // Third destination pixel

.LReallySmallCalc:
    ptest   %xmm7, %xmm1            // CF: every alpha (including the filler lanes) is 0xFF
    jc      .LReallySmallStore      // All opaque: store the source unchanged

    // Every other case, including fully transparent pixels, goes through the
    // general blend: dst = src + dst * (256 - alpha) / 256
    EXTRACT_ALPHA(xmm1, xmm2)       // %xmm4 = 256 - alpha, %xmm3 = copy of dst

    pand    %xmm0, %xmm5            // Isolate red/blue in the low byte of each word
    pmullw  %xmm4, %xmm5            // Scale red/blue (16-bit product)
    psrlw   $8, %xmm3               // Isolate alpha/green
    pmullw  %xmm4, %xmm3            // Scale alpha/green (result in the high byte)

    psrlw   $8, %xmm5               // Bring scaled red/blue down to the low byte
    pblendvb %xmm0, %xmm5, %xmm3    // Merge the two scaled halves
    paddb   %xmm3, %xmm1            // src + scaled dst

.LReallySmallStore:
    cmpl    $2, %ecx                // Flags stay valid across the pextrd stores below
    pextrd  $0x0, %xmm1, (%edx)     // First destination pixel
    jb      .LReallySmallExit       // count == 1
    pextrd  $0x1, %xmm1, 4(%edx)    // Second destination pixel
    je      .LReallySmallExit       // count == 2
    pextrd  $0x2, %xmm1, 8(%edx)    // Third destination pixel
.LReallySmallExit:
    ret
    .cfi_def_cfa_offset 8           // Code below runs with %edi pushed ...
    .cfi_offset %edi, -8            // ... and saved in the slot under the return address
205 | |
// ---------------------------------------------------------------------------
// Bigger blits (16+ pixels): align the destination to 16 bytes
//
// On entry %ecx = count - 4. The first four pixels are blended as one group,
// but only the 1-3 pixels in front of the alignment boundary are written
// back; the rest are handled again by the main loops.
// ---------------------------------------------------------------------------
    .p2align 4
.LBigBlit:
    testl   $0xF, %edx              // Destination already 16-byte aligned?
    lddqu   (%eax), %xmm1           // Pre-load four source pixels (flags untouched)
    jz      .LAligned

    movl    %edx, %edi              // %edi = bytes up to the next 16-byte boundary
    negl    %edi
    andl    $0xF, %edi

    ptest   %xmm7, %xmm1            // ZF: all transparent, CF: all opaque
    jz      .LAlignDone             // Transparent: skip, only adjust the pointers
    lddqu   (%edx), %xmm5           // Four destination pixels (flags untouched)
    jc      .LAlignStore            // Opaque: no blending needed

    // Mixed alphas; %xmm5 stays intact as the original destination
    EXTRACT_ALPHA(xmm1, xmm2)       // %xmm4 = 256 - alpha, %xmm3 = copy of dst

    psllw   $8, %xmm3               // Isolate red/blue in the copy
    pmulhuw %xmm4, %xmm3            // Scale red/blue
    movdqa  %xmm5, %xmm2
    psrlw   $8, %xmm2               // Isolate alpha/green
    pmullw  %xmm4, %xmm2            // Scale alpha/green

    pblendvb %xmm0, %xmm3, %xmm2    // Merge the two scaled halves
    paddb   %xmm2, %xmm1            // src + scaled dst

.LAlignStore:
    cmpl    $8, %edi                // 4, 8 or 12 bytes: 1, 2 or 3 pixels to write
    jb      .LAlignPixelsLeft1
    ja      .LAlignPixelsLeft3
    pblendw $0x0F, %xmm1, %xmm5     // Take the first two pixels
    jmp     .LAlignStorePixels

.LAlignPixelsLeft1:
    pblendw $0x03, %xmm1, %xmm5     // Take the first pixel only
    jmp     .LAlignStorePixels

.LAlignPixelsLeft3:
    pblendw $0x3F, %xmm1, %xmm5     // Take the first three pixels

.LAlignStorePixels:
    movdqu  %xmm5, (%edx)           // Untaken pixels are written back unchanged

.LAlignDone:
    addl    %edi, %eax              // Advance both pointers to the boundary
    addl    %edi, %edx
    shrl    $2, %edi                // Bytes to pixels
    lddqu   (%eax), %xmm1           // Pre-load again from the adjusted source
    subl    %edi, %ecx              // Pixels consumed by the alignment step

.LAligned:                          // Destination is 16-byte aligned from here on
    xorl    %edi, %edi              // offset = 0
    subl    $8, %ecx                // %ecx = pixels left - 12 (four stay reserved for cleanup)
    testl   $0xF, %eax              // Is the source aligned as well?
    jz      .LAlignedLoop           // Otherwise fall through to the unaligned loop
266 | |
// ---------------------------------------------------------------------------
// Main loop, source NOT 16-byte aligned (destination is)
//
// Eight pixels per iteration in two groups of four. %xmm1 always holds the
// pre-loaded group for offset %edi; the next group is loaded with lddqu while
// the current one is being classified. Destination accesses use movdqa.
// ---------------------------------------------------------------------------
    .p2align 4
.LUnalignedLoop:
    ptest   %xmm7, %xmm1            // ZF: all transparent, CF: all opaque
    ja      .LAlphaNotOpaqueOrZero00
    lddqu   16(%eax, %edi), %xmm2   // Pre-load the second group (flags untouched)
    jz      .LAlphaZero00
    movdqa  %xmm1, (%edx, %edi)     // Opaque: plain copy

.LAlphaZero00:
    ptest   %xmm7, %xmm2            // Classify the second group
    ja      .LAlphaNotOpaqueOrZero01
    lddqu   32(%eax, %edi), %xmm1   // Pre-load the first group of the next iteration
    jz      .LAlphaZero01
    movdqa  %xmm2, 16(%edx, %edi)   // Opaque: plain copy

.LAlphaZero01:
    addl    $32, %edi               // Eight pixels done
    subl    $8, %ecx
    jae     .LUnalignedLoop         // No borrow: eight more are available
    addl    $8, %ecx                // Undo the last subtraction for the cleanup code
    jmp     .LLoopCleanup0

    // First group has mixed alphas
    .p2align 4
.LAlphaNotOpaqueOrZero00:
    movdqa  (%edx, %edi), %xmm5     // Four destination pixels
    EXTRACT_ALPHA(xmm1, xmm2)       // %xmm4 = 256 - alpha, %xmm3 = copy of dst
    SCALE_PIXELS

    lddqu   16(%eax, %edi), %xmm2   // Pre-load the second group
    pblendvb %xmm0, %xmm5, %xmm3    // Merge the two scaled halves
    paddb   %xmm3, %xmm1            // src + scaled dst
    movdqa  %xmm1, (%edx, %edi)

    // Second group of this iteration
    ptest   %xmm7, %xmm2
    ja      .LAlphaNotOpaqueOrZero01
    lddqu   32(%eax, %edi), %xmm1   // Pre-load for the next iteration
    jz      .LAlphaZero02
    movdqa  %xmm2, 16(%edx, %edi)   // Opaque: plain copy
.LAlphaZero02:
    addl    $32, %edi
    subl    $8, %ecx
    jae     .LUnalignedLoop
    addl    $8, %ecx                // Undo the last subtraction for the cleanup code
    jmp     .LLoopCleanup0

    // Second group has mixed alphas (%xmm1 is free to use as scratch here)
    .p2align 4
.LAlphaNotOpaqueOrZero01:
    movdqa  16(%edx, %edi), %xmm5   // Four destination pixels
    EXTRACT_ALPHA(xmm2, xmm1)       // %xmm4 = 256 - alpha, %xmm3 = copy of dst
    SCALE_PIXELS

    lddqu   32(%eax, %edi), %xmm1   // Pre-load for the next iteration
    addl    $32, %edi
    pblendvb %xmm0, %xmm5, %xmm3    // Merge the two scaled halves
    paddb   %xmm3, %xmm2            // src + scaled dst
    subl    $8, %ecx                // Sets the flags consumed by jae below
    movdqa  %xmm2, -16(%edx, %edi)  // Offset was already advanced, hence -16
    jae     .LUnalignedLoop
    addl    $8, %ecx                // Undo the last subtraction for the cleanup code

    // Cleanup: %xmm1 holds a pending pre-loaded group for offset %edi.
    // Groups of four are processed until fewer than four pixels remain, then
    // the tail code of the small-blit path takes over.
.LLoopCleanup0:
    ptest   %xmm7, %xmm1
    ja      .LAlphaNotOpaqueOrZero02
    jz      .LAlphaZero03
    movdqa  %xmm1, (%edx, %edi)     // Opaque: plain copy
.LAlphaZero03:
    addl    $16, %edi
    subl    $4, %ecx
    js      .LSmallRemaining        // Fewer than four left: shared tail code
    lddqu   (%eax, %edi), %xmm1     // Next four source pixels
    jmp     .LLoopCleanup0

.LAlphaNotOpaqueOrZero02:
    movdqa  (%edx, %edi), %xmm5     // Four destination pixels
    EXTRACT_ALPHA(xmm1, xmm2)       // %xmm4 = 256 - alpha, %xmm3 = copy of dst
    SCALE_PIXELS

    addl    $16, %edi
    subl    $4, %ecx                // Sets the flags consumed by js below
    pblendvb %xmm0, %xmm5, %xmm3    // Merge the two scaled halves
    paddb   %xmm3, %xmm1            // src + scaled dst
    movdqa  %xmm1, -16(%edx, %edi)  // Offset was already advanced, hence -16
    js      .LSmallRemaining        // Fewer than four left: shared tail code
    lddqu   (%eax, %edi), %xmm1     // Next four source pixels
    jmp     .LLoopCleanup0
356 | |
// ---------------------------------------------------------------------------
// Main loop, source AND destination 16-byte aligned
//
// Same structure as the unaligned loop, but every access uses movdqa.
// %xmm1 always holds the pre-loaded group for offset %edi.
// ---------------------------------------------------------------------------
    .p2align 4
.LAlignedLoop:
    ptest   %xmm7, %xmm1            // ZF: all transparent, CF: all opaque
    ja      .LAlphaNotOpaqueOrZero10
    movdqa  16(%eax, %edi), %xmm2   // Pre-load the second group (flags untouched)
    jz      .LAlphaZero10
    movdqa  %xmm1, (%edx, %edi)     // Opaque: plain copy

.LAlphaZero10:
    ptest   %xmm7, %xmm2            // Classify the second group
    ja      .LAlphaNotOpaqueOrZero11
    movdqa  32(%eax, %edi), %xmm1   // Pre-load the first group of the next iteration
    jz      .LAlphaZero11
    movdqa  %xmm2, 16(%edx, %edi)   // Opaque: plain copy

.LAlphaZero11:
    addl    $32, %edi               // Eight pixels done
    subl    $8, %ecx
    jae     .LAlignedLoop           // No borrow: eight more are available
    jmp     .LLoopCleanup1

    // First group has mixed alphas
    .p2align 4
.LAlphaNotOpaqueOrZero10:
    movdqa  (%edx, %edi), %xmm5     // Four destination pixels
    EXTRACT_ALPHA(xmm1, xmm2)       // %xmm4 = 256 - alpha, %xmm3 = copy of dst
    SCALE_PIXELS

    movdqa  16(%eax, %edi), %xmm2   // Pre-load the second group
    pblendvb %xmm0, %xmm5, %xmm3    // Merge the two scaled halves
    paddb   %xmm3, %xmm1            // src + scaled dst
    movdqa  %xmm1, (%edx, %edi)

    // Second group of this iteration
    ptest   %xmm7, %xmm2
    ja      .LAlphaNotOpaqueOrZero11
    movdqa  32(%eax, %edi), %xmm1   // Pre-load for the next iteration
    jz      .LAlphaZero12
    movdqa  %xmm2, 16(%edx, %edi)   // Opaque: plain copy
.LAlphaZero12:
    addl    $32, %edi
    subl    $8, %ecx
    jae     .LAlignedLoop
    jmp     .LLoopCleanup1

    // Second group has mixed alphas (%xmm1 is free to use as scratch here)
    .p2align 4
.LAlphaNotOpaqueOrZero11:
    movdqa  16(%edx, %edi), %xmm5   // Four destination pixels
    EXTRACT_ALPHA(xmm2, xmm1)       // %xmm4 = 256 - alpha, %xmm3 = copy of dst
    SCALE_PIXELS
    movdqa  32(%eax, %edi), %xmm1   // Pre-load for the next iteration

    addl    $32, %edi
    pblendvb %xmm0, %xmm5, %xmm3    // Merge the two scaled halves
    paddb   %xmm3, %xmm2            // src + scaled dst
    subl    $8, %ecx                // Sets the flags consumed by jae below
    movdqa  %xmm2, -16(%edx, %edi)  // Offset was already advanced, hence -16
    jae     .LAlignedLoop

    // Cleanup: blend the four pending pixels held in %xmm1. Here %ecx is in
    // the range -8..-1; adding 8 gives the 0-7 pixels that remain afterwards.
.LLoopCleanup1:
    ptest   %xmm7, %xmm1
    ja      .LAlphaNotOpaqueOrZero12
    jz      .LAlphaZero13
    movdqa  %xmm1, (%edx, %edi)     // Opaque: plain copy
.LAlphaZero13:
    addl    $8, %ecx                // %ecx = pixels left after this group
    jz      .LExit
    addl    $16, %edi
    jmp     .LRemainLoop1

.LAlphaNotOpaqueOrZero12:
    movdqa  (%edx, %edi), %xmm5     // Four destination pixels
    EXTRACT_ALPHA(xmm1, xmm2)       // %xmm4 = 256 - alpha, %xmm3 = copy of dst
    SCALE_PIXELS

    addl    $8, %ecx                // %ecx = pixels left; flags consumed by jz below
    pblendvb %xmm0, %xmm5, %xmm3    // Merge the two scaled halves
    paddb   %xmm3, %xmm1            // src + scaled dst
    movdqa  %xmm1, (%edx, %edi)
    jz      .LExit
    addl    $16, %edi

    // Last 1-7 pixels. A whole aligned 16-byte group is loaded even when only
    // 1-3 pixels remain; .LRemainStore then writes back only the pixels that
    // belong to the row.
.LRemainLoop1:
    movdqa  (%eax, %edi), %xmm1     // Four source pixels
    ptest   %xmm7, %xmm1
    ja      .LRemainAlphaNotOpaqueOrZero1
    jz      .LRemainAlphaZero1

    // All opaque: copy
    subl    $4, %ecx                // More than four left?
    jle     .LRemainStore           // No: final (possibly partial) store
    movdqa  %xmm1, (%edx, %edi)
    addl    $16, %edi
    jmp     .LRemainLoop1

    // All transparent: skip
    .p2align 4
.LRemainAlphaZero1:
    subl    $4, %ecx                // More than four left?
    jle     .LExit
    addl    $16, %edi
    jmp     .LRemainLoop1

    // Mixed alphas: blend
    .p2align 4
.LRemainAlphaNotOpaqueOrZero1:
    movdqa  (%edx, %edi), %xmm5     // Four destination pixels
    EXTRACT_ALPHA(xmm1, xmm2)       // %xmm4 = 256 - alpha, %xmm3 = copy of dst
    SCALE_PIXELS

    subl    $4, %ecx                // Flags are consumed by jle below and jz in .LRemainStore
    pblendvb %xmm0, %xmm5, %xmm3    // Merge the two scaled halves
    paddb   %xmm3, %xmm1            // src + scaled dst
    jle     .LRemainStore
    movdqa  %xmm1, (%edx, %edi)
    addl    $16, %edi
    jmp     .LRemainLoop1

    // Store the last 1-4 pixels. Entered with the flags of "subl $4, %ecx":
    // zero means exactly four pixels were left, otherwise %ecx is -3, -2 or -1
    // for one, two or three pixels.
    .p2align 4
.LRemainStore:
    jz      .LRemainFull
    movdqa  (%edx, %edi), %xmm5     // Destination pixels to merge the result into
    cmpl    $-2, %ecx
    jb      .LRemainPixelsLeft11
    ja      .LRemainPixelsLeft13
    pblendw $0x0F, %xmm1, %xmm5     // Two left: take the first two pixels
    movdqa  %xmm5, (%edx, %edi)
.LExit:
    RETURN

.LRemainPixelsLeft11:
    pblendw $0x03, %xmm1, %xmm5     // One left: take the first pixel only
    movdqa  %xmm5, (%edx, %edi)
    RETURN

.LRemainPixelsLeft13:
    pblendw $0x3F, %xmm1, %xmm5     // Three left: take the first three pixels
    movdqa  %xmm5, (%edx, %edi)
    RETURN

.LRemainFull:
    movdqa  %xmm1, (%edx, %edi)     // Four left: store the whole group
    RETURN
504 | |
// End of S32A_Opaque_BlitRow32_SSE4_asm
    .cfi_endproc
#ifndef __clang__
    .size S32A_Opaque_BlitRow32_SSE4_asm, .-S32A_Opaque_BlitRow32_SSE4_asm

    // Declare that this object does not need an executable stack. Without
    // this note the GNU linker may mark the stack of the final image as
    // executable because of this hand-written object alone. It is emitted
    // only in the branch that already relies on ELF-only directives.
    .section .note.GNU-stack,"",@progbits
#endif
#endif
OLD | NEW |