OLD | NEW |
---|---|
(Empty) | |
1 /* | |
2 * Copyright 2013 The Android Open Source Project | |
3 * | |
4 * Use of this source code is governed by a BSD-style license that can be | |
5 * found in the LICENSE file. | |
6 */ | |
7 | |
8 #if !defined(_MSC_VER) | |
9 | |
10 /* | |
11 * void S32A_Opaque_BlitRow32_SSE4(SkPMColor* SK_RESTRICT dst, | |
12 * const SkPMColor* SK_RESTRICT src, | |
13 * int count, U8CPU alpha) | |
14 * | |
15 * The primary optimization comes from checking the source pixels' alpha value. | |
16 * If the alpha is zero, the pixel can be skipped entirely. | |
17 * If the alpha is fully opaque, the pixel can be copied directly to the destination. | |
18 * According to collected statistics, these two cases are the most common. | |
19 * The main loop(s) use pre-loading and unrolling in an attempt to reduce the | |
20 * worst-case memory latency. | |
21 */ | |
22 | |
23 .section .text.sse4,"ax",@progbits | |
24 .type S32A_Opaque_BlitRow32_SSE4_asm, @function | |
25 .globl S32A_Opaque_BlitRow32_SSE4_asm | |
26 | |
27 .p2align 4 | |
28 S32A_Opaque_BlitRow32_SSE4_asm: | |
29 .cfi_startproc | |
30 prefetcht0 (%rsi) | |
31 movl %edx, %ecx // Pixel count (ecx is the remaining-pixel counter from here on) | |
32 movq %rdi, %rdx // Destination pointer (kept in rdx) | |
33 movq %rsi, %rax // Source pointer (kept in rax) | |
34 | |
35 // Setup SSE constants (xmm7, xmm6 and xmm0 are only read after this point) | |
36 movdqa .LAlphaCheckMask(%rip), %xmm7 // 0xFF000000 mask to check alpha | |
37 movdqa .LInverseAlphaCalc(%rip), %xmm6// 16-bit 256 to calculate inv. alpha | |
38 movdqa .LResultMergeMask(%rip), %xmm0 // 0x00FF00FF mask (Must be in xmm0 because of pblendvb) | |
39 | |
40 subl $4, %ecx // ecx = count - 4; negative means we have only 0-3 pixels | |
41 js .LReallySmall | |
42 cmpl $11, %ecx // Do we have enough pixels (16+) to run the main loop? | |
43 ja .LBigBlit | |
44 | |
45 // Handle small blits (4-15 pixels), four pixels per iteration using unaligned accesses | |
46 // ******************************** | |
47 xorq %rdi, %rdi // Reset offset to zero (rdi = byte offset into source and destination) | |
48 | |
49 .LSmallLoop: | |
50 lddqu (%rax, %rdi), %xmm1 // Load four source pixels | |
51 ptest %xmm7, %xmm1 // Check alphas: ZF = all zero, CF = all opaque | |
52 ja .LSmallAlphaNotOpaqueOrZero | |
53 jz .LSmallAlphaZero | |
54 movdqu %xmm1, (%rdx, %rdi) // All opaque: store four destination pixels | |
55 .LSmallAlphaZero: | |
56 addq $16, %rdi | |
57 subl $4, %ecx // Check if there are four additional pixels, at least | |
58 jns .LSmallLoop | |
59 jmp .LSmallRemaining | |
60 | |
61 // Handle mixed alphas: result = src + ((dst * (256 - src alpha)) >> 8) per channel | |
62 .p2align 4 | |
63 .LSmallAlphaNotOpaqueOrZero: | |
64 lddqu (%rdx, %rdi), %xmm5 // Load four destination pixels | |
65 | |
66 movdqa %xmm1, %xmm2 // Clone source pixels to extract alpha | |
67 psrlw $8, %xmm2 // Discard red and blue, leaving alpha and green | |
68 pshufhw $0xF5, %xmm2, %xmm2 // Repeat alpha for scaling (high) | |
69 movdqa %xmm6, %xmm4 | |
70 pshuflw $0xF5, %xmm2, %xmm2 // Repeat alpha for scaling (low) | |
71 movdqa %xmm5, %xmm3 | |
72 psubw %xmm2, %xmm4 // Finalize alpha calculations (xmm4 = 256 - alpha per 16-bit lane) | |
73 | |
74 psllw $8, %xmm5 // Filter out red and blue components | |
75 pmulhuw %xmm4, %xmm5 // Scale red and blue (result ends up in the low byte of each word) | |
76 psrlw $8, %xmm3 // Filter out alpha and green components | |
77 pmullw %xmm4, %xmm3 // Scale alpha and green (result ends up in the high byte of each word) | |
78 | |
79 addq $16, %rdi | |
80 subl $4, %ecx // Count these four pixels; the sign flag is consumed by the jns below | |
81 pblendvb %xmm0, %xmm5, %xmm3 | |
82 paddb %xmm3, %xmm1 // Add source and destination pixels together | |
83 movdqu %xmm1, -16(%rdx, %rdi) // Store four destination pixels | |
84 jns .LSmallLoop | |
85 | |
86 // Handle the last 0-3 pixels by stepping back to a group of four that ends at the last pixel (also used by the big unaligned loop) | |
87 .LSmallRemaining: | |
88 cmpl $-4, %ecx // Check if we are done (ecx = pixels left - 4) | |
89 je .LSmallExit | |
90 sall $2, %ecx // Calculate (negative) byte offset for last pixels | |
91 movslq %ecx, %rcx | |
92 addq %rcx, %rdi | |
93 | |
94 lddqu (%rax, %rdi), %xmm1 // Load last four source pixels (overlapping) | |
95 ptest %xmm7, %xmm1 // Check alphas: ZF = all zero, CF = all opaque | |
96 jc .LSmallRemainingStoreAll// If all alphas are opaque, just store | |
97 jz .LSmallExit | |
98 | |
99 // Handle mixed alphas (calculate and scale); xmm5 keeps the loaded destination for the partial store | |
100 lddqu (%rdx, %rdi), %xmm5 // Load last four destination pixels (overlapping) | |
101 movdqa %xmm1, %xmm2 // Clone source pixels to extract alpha | |
102 psrlw $8, %xmm2 // Discard red and blue, leaving alpha and green | |
103 pshufhw $0xF5, %xmm2, %xmm2 // Repeat alpha for scaling (high) | |
104 movdqa %xmm6, %xmm4 | |
105 pshuflw $0xF5, %xmm2, %xmm2 // Repeat alpha for scaling (low) | |
106 movdqa %xmm5, %xmm3 | |
107 psubw %xmm2, %xmm4 // Finalize alpha calculations (xmm4 = 256 - alpha per 16-bit lane) | |
108 | |
109 psllw $8, %xmm3 // Filter out red and blue components | |
110 pmulhuw %xmm4, %xmm3 // Scale red and blue | |
111 movdqa %xmm5, %xmm2 | |
112 psrlw $8, %xmm2 // Filter out alpha and green components | |
113 pmullw %xmm4, %xmm2 // Scale alpha and green | |
114 | |
115 cmpl $-8, %ecx // Check how many pixels should be written (ecx is -12, -8 or -4 for 1, 2 or 3 pixels) | |
116 pblendvb %xmm0, %xmm3, %xmm2 // Combine results | |
117 paddb %xmm2, %xmm1 // Add source and destination pixels together | |
118 jb .LSmallPixelsLeft1 | |
119 ja .LSmallPixelsLeft3 | |
120 pblendw $0xF0, %xmm1, %xmm5 | |
121 movdqu %xmm5, (%rdx, %rdi) // Store last two destination pixels (the two before them are rewritten unchanged) | |
122 .LSmallExit: | |
123 ret | |
124 | |
125 .LSmallPixelsLeft1: | |
126 pblendw $0xC0, %xmm1, %xmm5 | |
127 movdqu %xmm5, (%rdx, %rdi) // Store last destination pixel (the three before it are rewritten unchanged) | |
128 ret | |
129 | |
130 .LSmallPixelsLeft3: | |
131 pblendw $0xFC, %xmm1, %xmm5 | |
132 movdqu %xmm5, (%rdx, %rdi) // Store last three destination pixels (the one before them is rewritten unchanged) | |
133 ret | |
134 | |
135 .LSmallRemainingStoreAll: | |
136 movdqu %xmm1, (%rdx, %rdi) // Store last destination pixels (overwrite) | |
137 ret | |
138 | |
139 // Handle really small blits (0-3 pixels), loading and storing one pixel at a time | |
140 // ************************************** | |
141 .LReallySmall: | |
142 addl $4, %ecx | |
143 jle .LReallySmallExit | |
144 pcmpeqd %xmm1, %xmm1 | |
145 cmpl $2, %ecx // Check how many pixels should be read (flags are used by the jb/je below) | |
146 pinsrd $0x0, (%rax), %xmm1 // Load one source pixel | |
147 pinsrd $0x0, (%rdx), %xmm5 // Load one destination pixel | |
148 jb .LReallySmallCalc | |
149 pinsrd $0x1, 4(%rax), %xmm1 // Load second source pixel | |
150 pinsrd $0x1, 4(%rdx), %xmm5 // Load second destination pixel | |
151 je .LReallySmallCalc | |
152 pinsrd $0x2, 8(%rax), %xmm1 // Load third source pixel | |
153 pinsrd $0x2, 8(%rdx), %xmm5 // Load third destination pixel | |
154 | |
155 .LReallySmallCalc: | |
156 ptest %xmm7, %xmm1 // Check if all alphas are opaque (CF) | |
157 jc .LReallySmallStore // If all alphas are opaque, just store | |
158 | |
159 // Handle mixed alphas (calculate and scale); all-zero alphas also take this path | |
160 movdqa %xmm1, %xmm2 // Clone source pixels to extract alpha | |
161 psrlw $8, %xmm2 // Discard red and blue, leaving alpha and green | |
162 pshufhw $0xF5, %xmm2, %xmm2 // Repeat alpha for scaling (high) | |
163 movdqa %xmm6, %xmm4 | |
164 pshuflw $0xF5, %xmm2, %xmm2 // Repeat alpha for scaling (low) | |
165 movdqa %xmm5, %xmm3 | |
166 psubw %xmm2, %xmm4 // Finalize alpha calculations (xmm4 = 256 - alpha per 16-bit lane) | |
167 | |
168 pand %xmm0, %xmm5 // Filter out red and blue components | |
169 pmullw %xmm4, %xmm5 // Scale red and blue | |
170 psrlw $8, %xmm3 // Filter out alpha and green components | |
171 pmullw %xmm4, %xmm3 // Scale alpha and green | |
172 | |
173 psrlw $8, %xmm5 // Combine results (move scaled red and blue down to the low bytes first) | |
174 pblendvb %xmm0, %xmm5, %xmm3 | |
175 paddb %xmm3, %xmm1 // Add source and destination pixels together | |
176 | |
177 .LReallySmallStore: | |
178 cmpl $2, %ecx // Check how many pixels should be written (flags are used by the jb/je below) | |
179 pextrd $0x0, %xmm1, (%rdx) // Store one destination pixel | |
180 jb .LReallySmallExit | |
181 pextrd $0x1, %xmm1, 4(%rdx) // Store second destination pixel | |
182 je .LReallySmallExit | |
183 pextrd $0x2, %xmm1, 8(%rdx) // Store third destination pixel | |
184 .LReallySmallExit: | |
185 ret | |
186 | |
187 // Handle bigger blit operations (16+ pixels) | |
188 // ****************************************** | |
189 .p2align 4 | |
190 .LBigBlit: | |
191 // Align destination? (skip straight to .LAligned when it is already 16-byte aligned) | |
192 testl $0xF, %edx | |
193 lddqu (%rax), %xmm1 // Pre-load four source pixels | |
194 jz .LAligned | |
195 | |
196 movq %rdx, %rdi // Calculate number of bytes needed to align the destination pointer | |
197 negq %rdi | |
198 andl $0xF, %edi | |
199 | |
200 // Handle 1-3 pixels to align destination | |
201 ptest %xmm7, %xmm1 // Check alphas: ZF = all zero, CF = all opaque | |
202 jz .LAlignDone // If all alphas are zero, just skip | |
203 lddqu (%rdx), %xmm5 // Load four destination pixels | |
204 jc .LAlignStore // If all alphas are opaque, just store | |
205 | |
206 // Handle mixed alphas (calculate and scale); xmm5 keeps the loaded destination for the partial store | |
207 movdqa %xmm1, %xmm2 // Clone source pixels to extract alpha | |
208 psrlw $8, %xmm2 // Discard red and blue, leaving alpha and green | |
209 pshufhw $0xF5, %xmm2, %xmm2 // Repeat alpha for scaling (high) | |
210 movdqa %xmm6, %xmm4 | |
211 pshuflw $0xF5, %xmm2, %xmm2 // Repeat alpha for scaling (low) | |
212 movdqa %xmm5, %xmm3 | |
213 psubw %xmm2, %xmm4 // Finalize alpha calculations (xmm4 = 256 - alpha per 16-bit lane) | |
214 | |
215 psllw $8, %xmm3 // Filter out red and blue components | |
216 pmulhuw %xmm4, %xmm3 // Scale red and blue | |
217 movdqa %xmm5, %xmm2 | |
218 psrlw $8, %xmm2 // Filter out alpha and green components | |
219 pmullw %xmm4, %xmm2 // Scale alpha and green | |
220 | |
221 pblendvb %xmm0, %xmm3, %xmm2 // Combine results | |
222 paddb %xmm2, %xmm1 // Add source and destination pixels together | |
223 | |
224 .LAlignStore: | |
225 cmpl $8, %edi // Check how many pixels should be written (edi = bytes up to the alignment boundary) | |
226 jb .LAlignPixelsLeft1 | |
227 ja .LAlignPixelsLeft3 | |
228 pblendw $0x0F, %xmm1, %xmm5 // Blend two pixels | |
229 jmp .LAlignStorePixels | |
230 | |
231 .LAlignPixelsLeft1: | |
232 pblendw $0x03, %xmm1, %xmm5 // Blend one pixel | |
233 jmp .LAlignStorePixels | |
234 | |
235 .LAlignPixelsLeft3: | |
236 pblendw $0x3F, %xmm1, %xmm5 // Blend three pixels | |
237 | |
238 .LAlignStorePixels: | |
239 movdqu %xmm5, (%rdx) // Store destination pixels (pixels past the boundary are rewritten unchanged) | |
240 | |
241 .LAlignDone: | |
242 addq %rdi, %rax // Adjust pointers and pixel count | |
243 addq %rdi, %rdx | |
244 shrq $2, %rdi | |
245 lddqu (%rax), %xmm1 // Pre-load new source pixels (after alignment) | |
246 subl %edi, %ecx | |
247 | |
248 .LAligned: // Destination is guaranteed to be 16 byte aligned | |
249 xorq %rdi, %rdi // Reset offset to zero (rdi = byte offset into source and destination) | |
250 subl $8, %ecx // Decrease counter (Reserve four pixels for the cleanup) | |
251 testl $0xF, %eax // Check alignment of source pointer | |
252 jz .LAlignedLoop | |
253 | |
254 // Source not aligned to destination (unaligned source loads, aligned destination accesses) | |
255 // ********************************* | |
256 .p2align 4 | |
257 .LUnalignedLoop: // Main loop for unaligned, handles eight pixels per iteration | |
258 ptest %xmm7, %xmm1 // Check alphas: ZF = all zero, CF = all opaque | |
259 ja .LAlphaNotOpaqueOrZero00 | |
260 lddqu 16(%rax, %rdi), %xmm2 // Pre-load four source pixels (the ptest flags stay valid for the jz below) | |
261 jz .LAlphaZero00 | |
262 movdqa %xmm1, (%rdx, %rdi) // Store four destination pixels | |
263 | |
264 .LAlphaZero00: | |
265 ptest %xmm7, %xmm2 // Check alphas: ZF = all zero, CF = all opaque | |
266 ja .LAlphaNotOpaqueOrZero01 | |
267 lddqu 32(%rax, %rdi), %xmm1 // Pre-load four source pixels (the ptest flags stay valid for the jz below) | |
268 jz .LAlphaZero01 | |
269 movdqa %xmm2, 16(%rdx, %rdi) // Store four destination pixels | |
270 | |
271 .LAlphaZero01: | |
272 addq $32, %rdi // Adjust offset and pixel count | |
273 subl $8, %ecx | |
274 jae .LUnalignedLoop | |
275 addl $8, %ecx // Adjust pixel count (undo the last subtraction before the cleanup) | |
276 jmp .LLoopCleanup0 | |
277 | |
278 .p2align 4 | |
279 .LAlphaNotOpaqueOrZero00: | |
280 movdqa (%rdx, %rdi), %xmm5 // Load four destination pixels | |
281 movdqa %xmm1, %xmm2 // Clone source pixels to extract alpha | |
282 psrlw $8, %xmm2 // Discard red and blue, leaving alpha and green | |
283 pshufhw $0xF5, %xmm2, %xmm2 // Repeat alpha for scaling (high) | |
284 movdqa %xmm6, %xmm4 | |
285 pshuflw $0xF5, %xmm2, %xmm2 // Repeat alpha for scaling (low) | |
286 movdqa %xmm5, %xmm3 | |
287 psubw %xmm2, %xmm4 // Finalize alpha calculations (xmm4 = 256 - alpha per 16-bit lane) | |
288 | |
289 psllw $8, %xmm5 // Filter out red and blue components | |
290 pmulhuw %xmm4, %xmm5 // Scale red and blue | |
291 psrlw $8, %xmm3 // Filter out alpha and green components | |
292 pmullw %xmm4, %xmm3 // Scale alpha and green | |
293 | |
294 lddqu 16(%rax, %rdi), %xmm2 // Pre-load four source pixels | |
295 pblendvb %xmm0, %xmm5, %xmm3 // Combine results | |
296 paddb %xmm3, %xmm1 // Add source and destination pixels together | |
297 movdqa %xmm1, (%rdx, %rdi) // Store four destination pixels | |
298 | |
299 // Handle next four pixels | |
300 ptest %xmm7, %xmm2 // Check alphas: ZF = all zero, CF = all opaque | |
301 ja .LAlphaNotOpaqueOrZero01 | |
302 lddqu 32(%rax, %rdi), %xmm1 // Pre-load four source pixels (the ptest flags stay valid for the jz below) | |
303 jz .LAlphaZero02 | |
304 movdqa %xmm2, 16(%rdx, %rdi) // Store four destination pixels | |
305 .LAlphaZero02: | |
306 addq $32, %rdi // Adjust offset and pixel count | |
307 subl $8, %ecx | |
308 jae .LUnalignedLoop | |
309 addl $8, %ecx // Adjust pixel count (undo the last subtraction before the cleanup) | |
310 jmp .LLoopCleanup0 | |
311 | |
312 .p2align 4 | |
313 .LAlphaNotOpaqueOrZero01: | |
314 movdqa 16(%rdx, %rdi), %xmm5 // Load four destination pixels | |
315 | |
316 movdqa %xmm2, %xmm1 // Clone source pixels to extract alpha | |
317 psrlw $8, %xmm1 // Discard red and blue, leaving alpha and green | |
318 pshufhw $0xF5, %xmm1, %xmm1 // Repeat alpha for scaling (high) | |
319 movdqa %xmm6, %xmm4 | |
320 pshuflw $0xF5, %xmm1, %xmm1 // Repeat alpha for scaling (low) | |
321 movdqa %xmm5, %xmm3 | |
322 psubw %xmm1, %xmm4 // Finalize alpha calculations (xmm4 = 256 - alpha per 16-bit lane) | |
323 | |
324 psllw $8, %xmm5 // Filter out red and blue components | |
325 pmulhuw %xmm4, %xmm5 // Scale red and blue | |
326 psrlw $8, %xmm3 // Filter out alpha and green components | |
327 pmullw %xmm4, %xmm3 // Scale alpha and green | |
328 | |
329 lddqu 32(%rax, %rdi), %xmm1 // Pre-load four source pixels for the next iteration | |
330 addq $32, %rdi | |
331 pblendvb %xmm0, %xmm5, %xmm3 // Combine results | |
332 paddb %xmm3, %xmm2 // Add source and destination pixels together | |
333 subl $8, %ecx | |
334 movdqa %xmm2, -16(%rdx, %rdi) // Store four destination pixels (the subl flags stay valid for the jae below) | |
335 jae .LUnalignedLoop | |
336 addl $8, %ecx // Adjust pixel count (undo the last subtraction before the cleanup) | |
337 | |
338 // Cleanup - handle pending pixels from loop (xmm1 holds four pre-loaded source pixels for offset rdi) | |
339 .LLoopCleanup0: | |
340 ptest %xmm7, %xmm1 // Check alphas: ZF = all zero, CF = all opaque | |
341 ja .LAlphaNotOpaqueOrZero02 | |
342 jz .LAlphaZero03 | |
343 movdqa %xmm1, (%rdx, %rdi) // Store four destination pixels | |
344 .LAlphaZero03: | |
345 addq $16, %rdi | |
346 subl $4, %ecx | |
347 js .LSmallRemaining // Reuse code from small loop | |
348 lddqu (%rax, %rdi), %xmm1 // Pre-load four source pixels | |
349 jmp .LLoopCleanup0 | |
350 | |
351 .LAlphaNotOpaqueOrZero02: | |
352 movdqa (%rdx, %rdi), %xmm5 // Load four destination pixels | |
353 movdqa %xmm1, %xmm2 // Clone source pixels to extract alpha | |
354 psrlw $8, %xmm2 // Discard red and blue, leaving alpha and green | |
355 pshufhw $0xF5, %xmm2, %xmm2 // Repeat alpha for scaling (high) | |
356 movdqa %xmm6, %xmm4 | |
357 pshuflw $0xF5, %xmm2, %xmm2 // Repeat alpha for scaling (low) | |
358 movdqa %xmm5, %xmm3 | |
359 psubw %xmm2, %xmm4 // Finalize alpha calculations (xmm4 = 256 - alpha per 16-bit lane) | |
360 | |
361 psllw $8, %xmm5 // Filter out red and blue components | |
362 pmulhuw %xmm4, %xmm5 // Scale red and blue | |
363 psrlw $8, %xmm3 // Filter out alpha and green components | |
364 pmullw %xmm4, %xmm3 // Scale alpha and green | |
365 | |
366 addq $16, %rdi | |
367 subl $4, %ecx | |
368 pblendvb %xmm0, %xmm5, %xmm3 // Combine results | |
369 paddb %xmm3, %xmm1 // Add source and destination pixels together | |
370 movdqa %xmm1, -16(%rdx, %rdi) // Store four destination pixels (the subl flags stay valid for the js below) | |
371 js .LSmallRemaining // Reuse code from small loop | |
372 lddqu (%rax, %rdi), %xmm1 // Pre-load four source pixels | |
373 jmp .LLoopCleanup0 | |
374 | |
375 // Source aligned to destination (both source and destination use aligned accesses) | |
376 // ***************************** | |
377 .p2align 4 | |
378 .LAlignedLoop: // Main loop for aligned, handles eight pixels per iteration | |
379 ptest %xmm7, %xmm1 // Check alphas: ZF = all zero, CF = all opaque | |
380 ja .LAlphaNotOpaqueOrZero10 | |
381 movdqa 16(%rax, %rdi), %xmm2 // Pre-load four source pixels (the ptest flags stay valid for the jz below) | |
382 jz .LAlphaZero10 | |
383 movdqa %xmm1, (%rdx, %rdi) // Store four destination pixels | |
384 | |
385 .LAlphaZero10: | |
386 ptest %xmm7, %xmm2 // Check alphas: ZF = all zero, CF = all opaque | |
387 ja .LAlphaNotOpaqueOrZero11 | |
388 movdqa 32(%rax, %rdi), %xmm1 // Pre-load four source pixels (the ptest flags stay valid for the jz below) | |
389 jz .LAlphaZero11 | |
390 movdqa %xmm2, 16(%rdx, %rdi) // Store four destination pixels | |
391 | |
392 .LAlphaZero11: | |
393 addq $32, %rdi // Adjust offset and pixel count | |
394 subl $8, %ecx | |
395 jae .LAlignedLoop | |
396 jmp .LLoopCleanup1 | |
397 | |
398 .p2align 4 | |
399 .LAlphaNotOpaqueOrZero10: | |
400 movdqa (%rdx, %rdi), %xmm5 // Load four destination pixels | |
401 movdqa %xmm1, %xmm2 // Clone source pixels to extract alpha | |
402 psrlw $8, %xmm2 // Discard red and blue, leaving alpha and green | |
403 pshufhw $0xF5, %xmm2, %xmm2 // Repeat alpha for scaling (high) | |
404 movdqa %xmm6, %xmm4 | |
405 pshuflw $0xF5, %xmm2, %xmm2 // Repeat alpha for scaling (low) | |
406 movdqa %xmm5, %xmm3 | |
407 psubw %xmm2, %xmm4 // Finalize alpha calculations (xmm4 = 256 - alpha per 16-bit lane) | |
408 | |
409 psllw $8, %xmm5 // Filter out red and blue components | |
410 pmulhuw %xmm4, %xmm5 // Scale red and blue | |
411 psrlw $8, %xmm3 // Filter out alpha and green components | |
412 pmullw %xmm4, %xmm3 // Scale alpha and green | |
413 | |
414 movdqa 16(%rax, %rdi), %xmm2 // Pre-load four source pixels | |
415 pblendvb %xmm0, %xmm5, %xmm3 // Combine results | |
416 paddb %xmm3, %xmm1 // Add source and destination pixels together | |
417 movdqa %xmm1, (%rdx, %rdi) // Store four destination pixels | |
418 | |
419 // Handle next four pixels | |
420 ptest %xmm7, %xmm2 // Check alphas: ZF = all zero, CF = all opaque | |
421 ja .LAlphaNotOpaqueOrZero11 | |
422 movdqa 32(%rax, %rdi), %xmm1 // Pre-load four source pixels (the ptest flags stay valid for the jz below) | |
423 jz .LAlphaZero12 | |
424 movdqa %xmm2, 16(%rdx, %rdi) // Store four destination pixels | |
425 .LAlphaZero12: | |
426 addq $32, %rdi // Adjust offset and pixel count | |
427 subl $8, %ecx | |
428 jae .LAlignedLoop | |
429 jmp .LLoopCleanup1 | |
430 | |
431 .p2align 4 | |
432 .LAlphaNotOpaqueOrZero11: | |
433 movdqa 16(%rdx, %rdi), %xmm5 // Load four destination pixels | |
434 | |
435 movdqa %xmm2, %xmm1 // Clone source pixels to extract alpha | |
436 psrlw $8, %xmm1 // Discard red and blue, leaving alpha and green | |
437 pshufhw $0xF5, %xmm1, %xmm1 // Repeat alpha for scaling (high) | |
438 movdqa %xmm6, %xmm4 | |
439 pshuflw $0xF5, %xmm1, %xmm1 // Repeat alpha for scaling (low) | |
440 movdqa %xmm5, %xmm3 | |
441 psubw %xmm1, %xmm4 // Finalize alpha calculations (xmm4 = 256 - alpha per 16-bit lane) | |
442 | |
443 psllw $8, %xmm5 // Filter out red and blue components | |
444 pmulhuw %xmm4, %xmm5 // Scale red and blue | |
445 psrlw $8, %xmm3 // Filter out alpha and green components | |
446 pmullw %xmm4, %xmm3 // Scale alpha and green | |
447 movdqa 32(%rax, %rdi), %xmm1 // Pre-load four source pixels for the next iteration | |
448 | |
449 addq $32, %rdi | |
450 pblendvb %xmm0, %xmm5, %xmm3 // Combine results | |
451 paddb %xmm3, %xmm2 // Add source and destination pixels together | |
452 subl $8, %ecx | |
453 movdqa %xmm2, -16(%rdx, %rdi) // Store four destination pixels (the subl flags stay valid for the jae below) | |
454 jae .LAlignedLoop | |
455 | |
456 // Cleanup - handle four pending pixels from loop (xmm1 holds the pre-loaded source pixels for offset rdi) | |
457 .LLoopCleanup1: | |
458 ptest %xmm7, %xmm1 // Check alphas: ZF = all zero, CF = all opaque | |
459 ja .LAlphaNotOpaqueOrZero12 | |
460 jz .LAlphaZero13 | |
461 movdqa %xmm1, (%rdx, %rdi) // Store four destination pixels | |
462 .LAlphaZero13: | |
463 addl $8, %ecx // Adjust pixel count (ecx = pixels left after these four) | |
464 jz .LExit | |
465 addq $16, %rdi | |
466 jmp .LRemainLoop1 | |
467 | |
468 .LAlphaNotOpaqueOrZero12: | |
469 movdqa (%rdx, %rdi), %xmm5 // Load four destination pixels | |
470 movdqa %xmm1, %xmm2 // Clone source pixels to extract alpha | |
471 psrlw $8, %xmm2 // Discard red and blue, leaving alpha and green | |
472 pshufhw $0xF5, %xmm2, %xmm2 // Repeat alpha for scaling (high) | |
473 movdqa %xmm6, %xmm4 | |
474 pshuflw $0xF5, %xmm2, %xmm2 // Repeat alpha for scaling (low) | |
475 movdqa %xmm5, %xmm3 | |
476 psubw %xmm2, %xmm4 // Finalize alpha calculations (xmm4 = 256 - alpha per 16-bit lane) | |
477 | |
478 psllw $8, %xmm5 // Filter out red and blue components | |
479 pmulhuw %xmm4, %xmm5 // Scale red and blue | |
480 psrlw $8, %xmm3 // Filter out alpha and green components | |
481 pmullw %xmm4, %xmm3 // Scale alpha and green | |
482 | |
483 addl $8, %ecx // Adjust pixel count (ecx = pixels left after these four; flags are used by the jz below) | |
484 pblendvb %xmm0, %xmm5, %xmm3 // Combine results | |
485 paddb %xmm3, %xmm1 // Add source and destination pixels together | |
486 movdqa %xmm1, (%rdx, %rdi) // Store four destination pixels | |
487 jz .LExit | |
488 addq $16, %rdi | |
489 | |
490 // Handle last 1-7 pixels | |
491 .LRemainLoop1: | |
492 movdqa (%rax, %rdi), %xmm1 // Load four source pixels. NOTE(review): a full 16 bytes are read even when fewer than four pixels remain | |
493 ptest %xmm7, %xmm1 // Check alphas: ZF = all zero, CF = all opaque | |
494 ja .LRemainAlphaNotOpaqueOrZero1 | |
495 jz .LRemainAlphaZero1 | |
496 | |
497 // All alphas were opaque (copy) | |
498 subl $4, %ecx // Check if we have more than four pixels left | |
499 jle .LRemainStore | |
500 movdqa %xmm1, (%rdx, %rdi) // Store four destination pixels | |
501 addq $16, %rdi | |
502 jmp .LRemainLoop1 | |
503 | |
504 // All alphas were zero (skip) | |
505 .p2align 4 | |
506 .LRemainAlphaZero1: | |
507 subl $4, %ecx // Check if we have more than four pixels left | |
508 jle .LExit | |
509 addq $16, %rdi | |
510 jmp .LRemainLoop1 | |
511 | |
512 // Handle mixed alphas (calculate and scale) | |
513 .p2align 4 | |
514 .LRemainAlphaNotOpaqueOrZero1: | |
515 movdqa (%rdx, %rdi), %xmm5 // Load four destination pixels | |
516 | |
517 movdqa %xmm1, %xmm2 // Clone source pixels to extract alpha | |
518 psrlw $8, %xmm2 // Discard red and blue, leaving alpha and green | |
519 pshufhw $0xF5, %xmm2, %xmm2 // Repeat alpha for scaling (high) | |
520 movdqa %xmm6, %xmm4 | |
521 pshuflw $0xF5, %xmm2, %xmm2 // Repeat alpha for scaling (low) | |
522 movdqa %xmm5, %xmm3 | |
523 psubw %xmm2, %xmm4 // Finalize alpha calculations (xmm4 = 256 - alpha per 16-bit lane) | |
524 | |
525 psllw $8, %xmm5 // Filter out red and blue components | |
526 pmulhuw %xmm4, %xmm5 // Scale red and blue | |
527 psrlw $8, %xmm3 // Filter out alpha and green components | |
528 pmullw %xmm4, %xmm3 // Scale alpha and green | |
529 | |
530 subl $4, %ecx | |
531 pblendvb %xmm0, %xmm5, %xmm3 // Combine results | |
532 paddb %xmm3, %xmm1 // Add source and destination pixels together | |
533 jle .LRemainStore | |
534 movdqa %xmm1, (%rdx, %rdi) // Store four destination pixels | |
535 addq $16, %rdi | |
536 jmp .LRemainLoop1 | |
537 | |
538 // Store the last 1-4 pixels (entered with the flags of the preceding subl $4, %ecx still valid) | |
539 .p2align 4 | |
540 .LRemainStore: | |
541 jz .LRemainFull | |
542 movdqa (%rdx, %rdi), %xmm5 // Load four destination pixels (reloaded because xmm5 may have been modified by the blend) | |
543 cmpl $-2, %ecx // Check how many pixels should be written (ecx is -3, -2 or -1 for 1, 2 or 3 pixels) | |
544 jb .LRemainPixelsLeft11 | |
545 ja .LRemainPixelsLeft13 | |
546 pblendw $0x0F, %xmm1, %xmm5 | |
547 movdqa %xmm5, (%rdx, %rdi) // Store last 2 destination pixels. NOTE(review): the full 16 bytes are written, the rest unchanged | |
548 .LExit: | |
549 ret | |
550 | |
551 .LRemainPixelsLeft11: | |
552 pblendw $0x03, %xmm1, %xmm5 | |
553 movdqa %xmm5, (%rdx, %rdi) // Store last destination pixel. NOTE(review): the full 16 bytes are written, the rest unchanged | |
554 ret | |
555 | |
556 .LRemainPixelsLeft13: | |
557 pblendw $0x3F, %xmm1, %xmm5 | |
558 movdqa %xmm5, (%rdx, %rdi) // Store last 3 destination pixels. NOTE(review): the full 16 bytes are written, the rest unchanged | |
559 ret | |
560 | |
561 .LRemainFull: | |
562 movdqa %xmm1, (%rdx, %rdi) // Store last 4 destination pixels | |
563 ret | |
564 | |
565 .cfi_endproc | |
566 .size S32A_Opaque_BlitRow32_SSE4_asm, .-S32A_Opaque_BlitRow32_SSE4_asm | |
567 | |
568 // Constants for SSE code, 16-byte aligned for movdqa: alpha check mask (0xFF000000 per pixel), inverse alpha base (256 per 16-bit lane), result merge mask (0x00FF00FF per pixel) | |
569 .pushsection .rodata.sse4,"a",@progbits | |
570 .p2align 4 | |
571 .LAlphaCheckMask: | |
mtklein
2014/05/16 18:06:38
Looks like the differences here are:
1) calling
henrik.smiding
2014/05/20 15:10:29
I tested doing a position independent version in 3
| |
572 .long 0xFF000000, 0xFF000000, 0xFF000000, 0xFF000000 | |
573 .LInverseAlphaCalc: | |
574 .word 256, 256, 256, 256, 256, 256, 256, 256 | |
575 .LResultMergeMask: | |
576 .long 0x00FF00FF, 0x00FF00FF, 0x00FF00FF, 0x00FF00FF | |
577 .popsection | |
578 #endif | |
OLD | NEW |