Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(67)

Side by Side Diff: source/row_msa.cc

Issue 2636483002: Add MSA optimized NV12/21 To RGB row functions (Closed)
Patch Set: Created 3 years, 11 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « source/row_any.cc ('k') | no next file » | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLD | NEW
1 /* 1 /*
2 * Copyright 2016 The LibYuv Project Authors. All rights reserved. 2 * Copyright 2016 The LibYuv Project Authors. All rights reserved.
3 * 3 *
4 * Use of this source code is governed by a BSD-style license 4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source 5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found 6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may 7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree. 8 * be found in the AUTHORS file in the root of the source tree.
9 */ 9 */
10 10
(...skipping 2005 matching lines...) Expand 10 before | Expand all | Expand 10 after
2016 res1 = __msa_copy_u_d((v2i64)dst0, 1); 2016 res1 = __msa_copy_u_d((v2i64)dst0, 1);
2017 SD(res0, dst_u); 2017 SD(res0, dst_u);
2018 SD(res1, dst_v); 2018 SD(res1, dst_v);
2019 t += 48; 2019 t += 48;
2020 s += 48; 2020 s += 48;
2021 dst_u += 8; 2021 dst_u += 8;
2022 dst_v += 8; 2022 dst_v += 8;
2023 } 2023 }
2024 } 2024 }
2025 2025
// Converts one row of NV12 input (a luma row in src_y plus a row of
// interleaved two-byte chroma pairs in src_uv) into 4-byte pixels written to
// rgb_buf, using MSA vector operations.
//
// Each loop iteration consumes 8 bytes of src_y and 8 bytes of src_uv (four
// chroma pairs, each shared by two adjacent pixels) and produces 32 output
// bytes. Within every output pixel the byte order in memory is: blue, green,
// red, 0xFF.
//
// The conversion coefficients and biases are read from yuvconstants.
//
// There is no tail handling: the loop always processes a full group of 8
// pixels, so when width is not a multiple of 8 it reads and writes past
// width. NOTE(review): callers presumably only pass multiples of 8 or wrap
// this in an "any width" helper - confirm.
2026 void NV12ToARGBRow_MSA(const uint8* src_y,
2027 const uint8* src_uv,
2028 uint8* rgb_buf,
2029 const struct YuvConstants* yuvconstants,
2030 int width) {
2031 int x;
2032 uint64 val0, val1;
2033 v16u8 src0, src1, dst0, dst1;
2034 v8u16 vec0, vec1, vec2, vec3;
2035 v4i32 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
2036 v4i32 vec_ub, vec_vr, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg;
2037 v16u8 zero = {0};
2038 v16u8 const_255 = (v16u8)__msa_ldi_b(255);
2039 v4i32 max = __msa_ldi_w(255);
2040
// Broadcast the coefficients. vec_ubvr alternates (kUVToB[0], kUVToR[1]) so
// it lines up with the (first byte, second byte) layout of the chroma pairs
// once they are widened to 32-bit lanes.
2041 vec_ub = __msa_fill_w(yuvconstants->kUVToB[0]);
2042 vec_vr = __msa_fill_w(yuvconstants->kUVToR[1]);
2043 vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
// Both green coefficients are packed into one 32-bit lane (kUVToG[0] in the
// low 16 bits, kUVToG[1] in the high 16 bits) for the halfword dot product
// below.
2044 vec_ugvg =
2045 __msa_fill_w((yuvconstants->kUVToG[0]) | (yuvconstants->kUVToG[1] << 16));
2046 vec_bb = __msa_fill_w(yuvconstants->kUVBiasB[0]);
2047 vec_bg = __msa_fill_w(yuvconstants->kUVBiasG[0]);
2048 vec_br = __msa_fill_w(yuvconstants->kUVBiasR[0]);
2049 vec_yg = __msa_fill_w(yuvconstants->kYToRgb[0]);
2050
2051 for (x = 0; x < width; x += 8) {
// Fetch the next 8 luma bytes and 8 chroma bytes. LD is a project macro not
// visible here; presumably a 64-bit load, given the uint64 destinations.
2052 val0 = LD(src_y);
2053 val1 = LD(src_uv);
2054 src0 = (v16u8)__msa_insert_d((v2i64)zero, 0, val0);
2055 src1 = (v16u8)__msa_insert_d((v2i64)zero, 0, val1);
// Duplicate each luma byte into both halves of a 16-bit lane (Y * 0x0101).
2056 vec0 = (v8u16)__msa_ilvr_b((v16i8)src0, (v16i8)src0);
// Zero-extend each chroma byte to 16 bits.
2057 vec1 = (v8u16)__msa_ilvr_b((v16i8)zero, (v16i8)src1);
// Zero-extend to 32-bit lanes: reg0/reg1 hold luma for pixels 0-3/4-7,
// reg2/reg3 hold the chroma samples in source order (two pairs each).
2058 reg0 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec0);
2059 reg1 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec0);
2060 reg2 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec1);
2061 reg3 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec1);
fbarchard1 2017/01/13 18:46:01 consider macros to read nv12, do yuvtorgb, and wri
manojkumar.bhosale 2017/01/17 10:29:45 Done.
// Scale luma by kYToRgb and keep the part above bit 16; multiply the chroma
// lanes by the alternating blue/red coefficients.
2062 reg0 *= vec_yg;
2063 reg1 *= vec_yg;
2064 reg2 *= vec_ubvr;
2065 reg3 *= vec_ubvr;
2066 reg0 = __msa_srai_w(reg0, 16);
2067 reg1 = __msa_srai_w(reg1, 16);
// Green term, one lane per chroma pair:
// first byte * kUVToG[0] + second byte * kUVToG[1].
2068 reg4 = __msa_dotp_s_w((v8i16)vec1, (v8i16)vec_ugvg);
// Every chroma pair serves two pixels, so duplicate the per-pair values:
// even lanes (first byte of each pair, times kUVToB) -> reg5/reg6,
// odd lanes (second byte of each pair, times kUVToR) -> reg2/reg3,
// green term -> reg7 (pixels 0-3) and reg4 (pixels 4-7).
2069 reg5 = __msa_ilvev_w(reg2, reg2);
2070 reg6 = __msa_ilvev_w(reg3, reg3);
2071 reg7 = __msa_ilvr_w(reg4, reg4);
2072 reg2 = __msa_ilvod_w(reg2, reg2);
2073 reg3 = __msa_ilvod_w(reg3, reg3);
2074 reg4 = __msa_ilvl_w(reg4, reg4);
// Channel value = scaled luma - chroma term + bias.
2075 reg5 = reg0 - reg5;
2076 reg6 = reg1 - reg6;
2077 reg2 = reg0 - reg2;
2078 reg3 = reg1 - reg3;
2079 reg7 = reg0 - reg7;
2080 reg4 = reg1 - reg4;
2081 reg5 += vec_bb;
2082 reg6 += vec_bb;
2083 reg7 += vec_bg;
2084 reg4 += vec_bg;
2085 reg2 += vec_br;
2086 reg3 += vec_br;
// Drop the 6 fractional bits.
2087 reg5 = __msa_srai_w(reg5, 6);
2088 reg6 = __msa_srai_w(reg6, 6);
2089 reg7 = __msa_srai_w(reg7, 6);
2090 reg4 = __msa_srai_w(reg4, 6);
2091 reg2 = __msa_srai_w(reg2, 6);
2092 reg3 = __msa_srai_w(reg3, 6);
// Clamp every channel to [0, 255]: first the lower bound, then the upper.
2093 reg5 = __msa_maxi_s_w(reg5, 0);
fbarchard1 2017/01/13 18:46:01 no free way to pack words to bytes with saturation
manojkumar.bhosale 2017/01/17 10:29:45 No
2094 reg6 = __msa_maxi_s_w(reg6, 0);
2095 reg7 = __msa_maxi_s_w(reg7, 0);
2096 reg4 = __msa_maxi_s_w(reg4, 0);
2097 reg2 = __msa_maxi_s_w(reg2, 0);
2098 reg3 = __msa_maxi_s_w(reg3, 0);
2099 reg5 = __msa_min_s_w(max, reg5);
2100 reg6 = __msa_min_s_w(max, reg6);
2101 reg7 = __msa_min_s_w(max, reg7);
2102 reg4 = __msa_min_s_w(max, reg4);
2103 reg2 = __msa_min_s_w(max, reg2);
2104 reg3 = __msa_min_s_w(max, reg3);
// Pack the low byte of each 32-bit lane into 4-byte pixels: first build
// (blue, green) and (red, 0xFF) halfwords, then interleave those halfwords.
2105 vec0 = (v8u16)__msa_ilvev_b((v16i8)reg7, (v16i8)reg5);
2106 vec1 = (v8u16)__msa_ilvev_b((v16i8)const_255, (v16i8)reg2);
2107 vec2 = (v8u16)__msa_ilvev_b((v16i8)reg4, (v16i8)reg6);
2108 vec3 = (v8u16)__msa_ilvev_b((v16i8)const_255, (v16i8)reg3);
2109 dst0 = (v16u8)__msa_ilvev_h((v8i16)vec1, (v8i16)vec0);
2110 dst1 = (v16u8)__msa_ilvev_h((v8i16)vec3, (v8i16)vec2);
// Write the 8 pixels. ST_UB2 is a project macro not visible here; presumably
// it stores dst0 and dst1 16 bytes apart.
2111 ST_UB2(dst0, dst1, rgb_buf, 16);
2112 src_y += 8;
2113 src_uv += 8;
2114 rgb_buf += 32;
2115 }
2116 }
2117
// Converts one row of NV12 input (a luma row in src_y plus a row of
// interleaved two-byte chroma pairs in src_uv) into 16-bit pixels written to
// rgb_buf, using MSA vector operations.
//
// Each loop iteration consumes 8 bytes of src_y and 8 bytes of src_uv (four
// chroma pairs, each shared by two adjacent pixels) and produces 16 output
// bytes. Each 16-bit pixel holds blue in bits 0-4, green in bits 5-10 and
// red in bits 11-15.
//
// The conversion coefficients and biases are read from yuvconstants.
//
// There is no tail handling: the loop always processes a full group of 8
// pixels, so when width is not a multiple of 8 it reads and writes past
// width. NOTE(review): callers presumably only pass multiples of 8 or wrap
// this in an "any width" helper - confirm.
2118 void NV12ToRGB565Row_MSA(const uint8* src_y,
2119 const uint8* src_uv,
2120 uint8* rgb_buf,
2121 const struct YuvConstants* yuvconstants,
2122 int width) {
2123 int x;
2124 uint64 val0, val1;
2125 v16u8 src0, src1, dst0;
2126 v8u16 vec0, vec1, vec2;
2127 v4i32 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
2128 v4i32 vec_ub, vec_vr, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg;
2129 v16u8 zero = {0};
2130 v4i32 max = __msa_ldi_w(255);
2131
// Broadcast the coefficients. vec_ubvr alternates (kUVToB[0], kUVToR[1]) so
// it lines up with the (first byte, second byte) layout of the chroma pairs
// once they are widened to 32-bit lanes.
2132 vec_ub = __msa_fill_w(yuvconstants->kUVToB[0]);
2133 vec_vr = __msa_fill_w(yuvconstants->kUVToR[1]);
2134 vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
// Both green coefficients are packed into one 32-bit lane (kUVToG[0] in the
// low 16 bits, kUVToG[1] in the high 16 bits) for the halfword dot product
// below.
2135 vec_ugvg =
2136 __msa_fill_w((yuvconstants->kUVToG[0]) | (yuvconstants->kUVToG[1] << 16));
2137 vec_bb = __msa_fill_w(yuvconstants->kUVBiasB[0]);
2138 vec_bg = __msa_fill_w(yuvconstants->kUVBiasG[0]);
2139 vec_br = __msa_fill_w(yuvconstants->kUVBiasR[0]);
2140 vec_yg = __msa_fill_w(yuvconstants->kYToRgb[0]);
2141
2142 for (x = 0; x < width; x += 8) {
// Fetch the next 8 luma bytes and 8 chroma bytes. LD is a project macro not
// visible here; presumably a 64-bit load, given the uint64 destinations.
2143 val0 = LD(src_y);
2144 val1 = LD(src_uv);
2145 src0 = (v16u8)__msa_insert_d((v2i64)zero, 0, val0);
2146 src1 = (v16u8)__msa_insert_d((v2i64)zero, 0, val1);
// Duplicate each luma byte into both halves of a 16-bit lane (Y * 0x0101).
2147 vec0 = (v8u16)__msa_ilvr_b((v16i8)src0, (v16i8)src0);
// Zero-extend each chroma byte to 16 bits.
2148 vec1 = (v8u16)__msa_ilvr_b((v16i8)zero, (v16i8)src1);
// Zero-extend to 32-bit lanes: reg0/reg1 hold luma for pixels 0-3/4-7,
// reg2/reg3 hold the chroma samples in source order (two pairs each).
2149 reg0 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec0);
2150 reg1 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec0);
2151 reg2 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec1);
2152 reg3 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec1);
// Scale luma by kYToRgb and keep the part above bit 16; multiply the chroma
// lanes by the alternating blue/red coefficients.
2153 reg0 *= vec_yg;
2154 reg1 *= vec_yg;
2155 reg2 *= vec_ubvr;
2156 reg3 *= vec_ubvr;
2157 reg0 = __msa_srai_w(reg0, 16);
2158 reg1 = __msa_srai_w(reg1, 16);
// Green term, one lane per chroma pair:
// first byte * kUVToG[0] + second byte * kUVToG[1].
2159 reg4 = __msa_dotp_s_w((v8i16)vec1, (v8i16)vec_ugvg);
// Every chroma pair serves two pixels, so duplicate the per-pair values:
// even lanes (first byte of each pair, times kUVToB) -> reg5/reg6,
// odd lanes (second byte of each pair, times kUVToR) -> reg2/reg3,
// green term -> reg7 (pixels 0-3) and reg4 (pixels 4-7).
2160 reg5 = __msa_ilvev_w(reg2, reg2);
2161 reg6 = __msa_ilvev_w(reg3, reg3);
2162 reg7 = __msa_ilvr_w(reg4, reg4);
2163 reg2 = __msa_ilvod_w(reg2, reg2);
2164 reg3 = __msa_ilvod_w(reg3, reg3);
2165 reg4 = __msa_ilvl_w(reg4, reg4);
// Channel value = scaled luma - chroma term + bias.
2166 reg5 = reg0 - reg5;
2167 reg6 = reg1 - reg6;
2168 reg2 = reg0 - reg2;
2169 reg3 = reg1 - reg3;
2170 reg7 = reg0 - reg7;
2171 reg4 = reg1 - reg4;
2172 reg5 += vec_bb;
2173 reg6 += vec_bb;
2174 reg7 += vec_bg;
2175 reg4 += vec_bg;
2176 reg2 += vec_br;
2177 reg3 += vec_br;
// Drop the 6 fractional bits.
2178 reg5 = __msa_srai_w(reg5, 6);
2179 reg6 = __msa_srai_w(reg6, 6);
2180 reg7 = __msa_srai_w(reg7, 6);
2181 reg4 = __msa_srai_w(reg4, 6);
2182 reg2 = __msa_srai_w(reg2, 6);
2183 reg3 = __msa_srai_w(reg3, 6);
// Clamp every channel to [0, 255]: first the lower bound, then the upper.
2184 reg5 = __msa_maxi_s_w(reg5, 0);
2185 reg6 = __msa_maxi_s_w(reg6, 0);
2186 reg7 = __msa_maxi_s_w(reg7, 0);
2187 reg4 = __msa_maxi_s_w(reg4, 0);
2188 reg2 = __msa_maxi_s_w(reg2, 0);
2189 reg3 = __msa_maxi_s_w(reg3, 0);
2190 reg5 = __msa_min_s_w(max, reg5);
2191 reg6 = __msa_min_s_w(max, reg6);
2192 reg7 = __msa_min_s_w(max, reg7);
2193 reg4 = __msa_min_s_w(max, reg4);
2194 reg2 = __msa_min_s_w(max, reg2);
2195 reg3 = __msa_min_s_w(max, reg3);
// Narrow each channel to eight 16-bit lanes (pixels 0-7):
// vec0 = blue, vec1 = green, vec2 = red.
2196 vec0 = (v8u16)__msa_pckev_h((v8i16)reg6, (v8i16)reg5);
2197 vec1 = (v8u16)__msa_pckev_h((v8i16)reg4, (v8i16)reg7);
2198 vec2 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2);
// Reduce to 5/6/5 bits and move each channel to its bit position.
2199 vec0 = vec0 >> 3;
2200 vec1 = (vec1 >> 2) << 5;
2201 vec2 = (vec2 >> 3) << 11;
2202 dst0 = (v16u8)(vec0 | vec1 | vec2);
// Write the 8 pixels (ST_UB is a project macro not visible here; presumably a
// 16-byte vector store).
2203 ST_UB(dst0, rgb_buf);
2204 src_y += 8;
2205 src_uv += 8;
2206 rgb_buf += 16;
2207 }
2208 }
2209
// Converts one row of NV21 input (a luma row in src_y plus a row of
// interleaved two-byte chroma pairs in src_vu) into 4-byte pixels written to
// rgb_buf, using MSA vector operations.
//
// This mirrors the NV12 ARGB conversion with the two chroma bytes of each
// pair swapped: here the first byte of a pair is multiplied by kUVToR[1] and
// kUVToG[1], and the second byte by kUVToB[0] and kUVToG[0].
//
// Each loop iteration consumes 8 bytes of src_y and 8 bytes of src_vu (four
// chroma pairs, each shared by two adjacent pixels) and produces 32 output
// bytes. Within every output pixel the byte order in memory is: blue, green,
// red, 0xFF.
//
// There is no tail handling: the loop always processes a full group of 8
// pixels, so when width is not a multiple of 8 it reads and writes past
// width. NOTE(review): callers presumably only pass multiples of 8 or wrap
// this in an "any width" helper - confirm.
2210 void NV21ToARGBRow_MSA(const uint8* src_y,
2211 const uint8* src_vu,
2212 uint8* rgb_buf,
2213 const struct YuvConstants* yuvconstants,
2214 int width) {
2215 int x;
2216 uint64 val0, val1;
2217 v16u8 src0, src1, dst0, dst1;
2218 v8u16 vec0, vec1, vec2, vec3;
2219 v4i32 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
2220 v4i32 vec_ub, vec_vr, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg;
2221 v16u8 zero = {0};
2222 v16u8 const_255 = (v16u8)__msa_ldi_b(255);
2223 v4i32 max = __msa_ldi_w(255);
2224
// Broadcast the coefficients. Despite its name, vec_ubvr here alternates
// (kUVToR[1], kUVToB[0]) - red coefficient in even lanes, blue in odd lanes -
// to match the swapped chroma byte order.
2225 vec_ub = __msa_fill_w(yuvconstants->kUVToB[0]);
2226 vec_vr = __msa_fill_w(yuvconstants->kUVToR[1]);
2227 vec_ubvr = __msa_ilvr_w(vec_ub, vec_vr);
// Green coefficients packed into one 32-bit lane, also swapped: kUVToG[1] in
// the low 16 bits, kUVToG[0] in the high 16 bits.
2228 vec_ugvg =
2229 __msa_fill_w((yuvconstants->kUVToG[1] | (yuvconstants->kUVToG[0] << 16)));
2230 vec_bb = __msa_fill_w(yuvconstants->kUVBiasB[0]);
2231 vec_bg = __msa_fill_w(yuvconstants->kUVBiasG[0]);
2232 vec_br = __msa_fill_w(yuvconstants->kUVBiasR[0]);
2233 vec_yg = __msa_fill_w(yuvconstants->kYToRgb[0]);
2234
2235 for (x = 0; x < width; x += 8) {
// Fetch the next 8 luma bytes and 8 chroma bytes. LD is a project macro not
// visible here; presumably a 64-bit load, given the uint64 destinations.
2236 val0 = LD(src_y);
2237 val1 = LD(src_vu);
2238 src0 = (v16u8)__msa_insert_d((v2i64)zero, 0, val0);
2239 src1 = (v16u8)__msa_insert_d((v2i64)zero, 0, val1);
// Duplicate each luma byte into both halves of a 16-bit lane (Y * 0x0101).
2240 vec0 = (v8u16)__msa_ilvr_b((v16i8)src0, (v16i8)src0);
// Zero-extend each chroma byte to 16 bits.
2241 vec1 = (v8u16)__msa_ilvr_b((v16i8)zero, (v16i8)src1);
// Zero-extend to 32-bit lanes: reg0/reg1 hold luma for pixels 0-3/4-7,
// reg2/reg3 hold the chroma samples in source order (two pairs each).
2242 reg0 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec0);
2243 reg1 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec0);
2244 reg2 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec1);
2245 reg3 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec1);
// Scale luma by kYToRgb and keep the part above bit 16; multiply the chroma
// lanes by the alternating red/blue coefficients.
2246 reg0 *= vec_yg;
2247 reg1 *= vec_yg;
2248 reg2 *= vec_ubvr;
2249 reg3 *= vec_ubvr;
2250 reg0 = __msa_srai_w(reg0, 16);
2251 reg1 = __msa_srai_w(reg1, 16);
// Green term, one lane per chroma pair:
// first byte * kUVToG[1] + second byte * kUVToG[0].
2252 reg4 = __msa_dotp_s_w((v8i16)vec1, (v8i16)vec_ugvg);
// Every chroma pair serves two pixels, so duplicate the per-pair values:
// odd lanes (second byte of each pair, times kUVToB) -> reg5/reg6,
// even lanes (first byte of each pair, times kUVToR) -> reg2/reg3,
// green term -> reg7 (pixels 0-3) and reg4 (pixels 4-7).
2253 reg5 = __msa_ilvod_w(reg2, reg2);
2254 reg6 = __msa_ilvod_w(reg3, reg3);
2255 reg7 = __msa_ilvr_w(reg4, reg4);
2256 reg2 = __msa_ilvev_w(reg2, reg2);
2257 reg3 = __msa_ilvev_w(reg3, reg3);
2258 reg4 = __msa_ilvl_w(reg4, reg4);
// Channel value = scaled luma - chroma term + bias.
2259 reg5 = reg0 - reg5;
2260 reg6 = reg1 - reg6;
2261 reg2 = reg0 - reg2;
2262 reg3 = reg1 - reg3;
2263 reg7 = reg0 - reg7;
2264 reg4 = reg1 - reg4;
2265 reg5 += vec_bb;
2266 reg6 += vec_bb;
2267 reg7 += vec_bg;
2268 reg4 += vec_bg;
2269 reg2 += vec_br;
2270 reg3 += vec_br;
// Drop the 6 fractional bits.
2271 reg5 = __msa_srai_w(reg5, 6);
2272 reg6 = __msa_srai_w(reg6, 6);
2273 reg7 = __msa_srai_w(reg7, 6);
2274 reg4 = __msa_srai_w(reg4, 6);
2275 reg2 = __msa_srai_w(reg2, 6);
2276 reg3 = __msa_srai_w(reg3, 6);
// Clamp every channel to [0, 255]: first the lower bound, then the upper.
2277 reg5 = __msa_maxi_s_w(reg5, 0);
2278 reg6 = __msa_maxi_s_w(reg6, 0);
2279 reg7 = __msa_maxi_s_w(reg7, 0);
2280 reg4 = __msa_maxi_s_w(reg4, 0);
2281 reg2 = __msa_maxi_s_w(reg2, 0);
2282 reg3 = __msa_maxi_s_w(reg3, 0);
2283 reg5 = __msa_min_s_w(max, reg5);
2284 reg6 = __msa_min_s_w(max, reg6);
2285 reg7 = __msa_min_s_w(max, reg7);
2286 reg4 = __msa_min_s_w(max, reg4);
2287 reg2 = __msa_min_s_w(max, reg2);
2288 reg3 = __msa_min_s_w(max, reg3);
// Pack the low byte of each 32-bit lane into 4-byte pixels: first build
// (blue, green) and (red, 0xFF) halfwords, then interleave those halfwords.
2289 vec0 = (v8u16)__msa_ilvev_b((v16i8)reg7, (v16i8)reg5);
2290 vec1 = (v8u16)__msa_ilvev_b((v16i8)const_255, (v16i8)reg2);
2291 vec2 = (v8u16)__msa_ilvev_b((v16i8)reg4, (v16i8)reg6);
2292 vec3 = (v8u16)__msa_ilvev_b((v16i8)const_255, (v16i8)reg3);
2293 dst0 = (v16u8)__msa_ilvev_h((v8i16)vec1, (v8i16)vec0);
2294 dst1 = (v16u8)__msa_ilvev_h((v8i16)vec3, (v8i16)vec2);
// Write the 8 pixels. ST_UB2 is a project macro not visible here; presumably
// it stores dst0 and dst1 16 bytes apart.
2295 ST_UB2(dst0, dst1, rgb_buf, 16);
2296 src_y += 8;
2297 src_vu += 8;
2298 rgb_buf += 32;
2299 }
2300 }
2301
// Combines a row of X and Y Sobel magnitudes into 4-byte pixels.
//
// For every input position the two source bytes are added with unsigned
// saturation, and the sum s is written as the four bytes (s, s, s, 0xFF).
// Each loop iteration consumes 16 bytes from each source and writes 64 bytes
// to dst_argb.
//
// There is no tail handling: a width that is not a multiple of 16 makes the
// loop read and write past width. NOTE(review): callers presumably guarantee
// a multiple of 16 or use an "any width" wrapper - confirm.
2302 void SobelRow_MSA(const uint8* src_sobelx,
2303 const uint8* src_sobely,
2304 uint8* dst_argb,
2305 int width) {
2306 int x;
2307 v16u8 src0, src1, vec0, dst0, dst1, dst2, dst3;
// Shuffle control for the first four pixels: indices 0-3 select bytes of the
// summed vector (each repeated three times), index 16 selects a byte of the
// all-0xFF vector. mask1..mask3 add 4 per step so they pick sums 4-7, 8-11
// and 12-15; their last index becomes 20/24/28, which still addresses the
// all-0xFF vector.
2308 v16i8 mask0 = {0, 0, 0, 16, 1, 1, 1, 16, 2, 2, 2, 16, 3, 3, 3, 16};
2309 v16i8 const_0x4 = __msa_ldi_b(0x4);
2310 v16i8 mask1 = mask0 + const_0x4;
2311 v16i8 mask2 = mask1 + const_0x4;
2312 v16i8 mask3 = mask2 + const_0x4;
2313 v16u8 const_0xFF = (v16u8)__msa_ldi_b(0xFF);
2314
2315 for (x = 0; x < width; x += 16) {
2316 src0 = (v16u8)__msa_ld_b((v16i8*)src_sobelx, 0);
2317 src1 = (v16u8)__msa_ld_b((v16i8*)src_sobely, 0);
// Saturating unsigned add of the X and Y magnitudes.
2318 vec0 = __msa_adds_u_b(src0, src1);
// Expand the 16 sums into 16 four-byte pixels, four pixels per vector.
2319 dst0 = (v16u8)__msa_vshf_b(mask0, (v16i8)const_0xFF, (v16i8)vec0);
2320 dst1 = (v16u8)__msa_vshf_b(mask1, (v16i8)const_0xFF, (v16i8)vec0);
2321 dst2 = (v16u8)__msa_vshf_b(mask2, (v16i8)const_0xFF, (v16i8)vec0);
2322 dst3 = (v16u8)__msa_vshf_b(mask3, (v16i8)const_0xFF, (v16i8)vec0);
// ST_UB4 is a project macro not visible here; presumably it stores the four
// vectors 16 bytes apart.
2323 ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16);
2324 src_sobelx += 16;
2325 src_sobely += 16;
2326 dst_argb += 64;
2327 }
2328 }
2329
// Combines a row of X and Y Sobel magnitudes into a single-byte-per-pixel
// plane: each output byte is the unsigned saturating sum of the two source
// bytes at the same position.
//
// Each loop iteration consumes 32 bytes from each source and writes 32 bytes
// to dst_y. There is no tail handling: a width that is not a multiple of 32
// makes the loop read and write past width. NOTE(review): callers presumably
// guarantee a multiple of 32 or use an "any width" wrapper - confirm.
2330 void SobelToPlaneRow_MSA(const uint8* src_sobelx,
2331 const uint8* src_sobely,
2332 uint8* dst_y,
2333 int width) {
2334 int x;
2335 v16u8 src0, src1, src2, src3, dst0, dst1;
2336
2337 for (x = 0; x < width; x += 32) {
// Two 16-byte vectors from each source (offsets 0 and 16).
2338 src0 = (v16u8)__msa_ld_b((v16i8*)src_sobelx, 0);
2339 src1 = (v16u8)__msa_ld_b((v16i8*)src_sobelx, 16);
2340 src2 = (v16u8)__msa_ld_b((v16i8*)src_sobely, 0);
2341 src3 = (v16u8)__msa_ld_b((v16i8*)src_sobely, 16);
// Saturating unsigned add of matching X and Y vectors.
2342 dst0 = __msa_adds_u_b(src0, src2);
2343 dst1 = __msa_adds_u_b(src1, src3);
// ST_UB2 is a project macro not visible here; presumably it stores the two
// vectors 16 bytes apart.
2344 ST_UB2(dst0, dst1, dst_y, 16);
2345 src_sobelx += 32;
2346 src_sobely += 32;
2347 dst_y += 32;
2348 }
2349 }
2350
// Combines a row of X and Y Sobel magnitudes into 4-byte pixels that keep
// both components.
//
// For every input position the four output bytes are, in memory order:
// the src_sobely byte, the unsigned saturating sum of both bytes, the
// src_sobelx byte, and 0xFF. Each loop iteration consumes 16 bytes from each
// source and writes 64 bytes to dst_argb.
//
// There is no tail handling: a width that is not a multiple of 16 makes the
// loop read and write past width. NOTE(review): callers presumably guarantee
// a multiple of 16 or use an "any width" wrapper - confirm.
2351 void SobelXYRow_MSA(const uint8* src_sobelx,
2352 const uint8* src_sobely,
2353 uint8* dst_argb,
2354 int width) {
2355 int x;
2356 v16u8 src0, src1, vec0, vec1, vec2;
2357 v16u8 reg0, reg1, dst0, dst1, dst2, dst3;
2358 v16u8 const_0xFF = (v16u8)__msa_ldi_b(0xFF);
2359
2360 for (x = 0; x < width; x += 16) {
2361 src0 = (v16u8)__msa_ld_b((v16i8*)src_sobelx, 0);
2362 src1 = (v16u8)__msa_ld_b((v16i8*)src_sobely, 0);
// Saturating unsigned add of the X and Y magnitudes.
2363 vec0 = __msa_adds_u_b(src0, src1);
// Byte pairs (sobely, sobelx) for positions 0-7 (vec1) and 8-15 (vec2).
2364 vec1 = (v16u8)__msa_ilvr_b((v16i8)src0, (v16i8)src1);
2365 vec2 = (v16u8)__msa_ilvl_b((v16i8)src0, (v16i8)src1);
// Byte pairs (sum, 0xFF) for positions 0-7 (reg0) and 8-15 (reg1).
2366 reg0 = (v16u8)__msa_ilvr_b((v16i8)const_0xFF, (v16i8)vec0);
2367 reg1 = (v16u8)__msa_ilvl_b((v16i8)const_0xFF, (v16i8)vec0);
// A second byte-wise interleave merges the two pair streams into
// (sobely, sum, sobelx, 0xFF) per pixel, four pixels per vector.
2368 dst0 = (v16u8)__msa_ilvr_b((v16i8)reg0, (v16i8)vec1);
2369 dst1 = (v16u8)__msa_ilvl_b((v16i8)reg0, (v16i8)vec1);
2370 dst2 = (v16u8)__msa_ilvr_b((v16i8)reg1, (v16i8)vec2);
2371 dst3 = (v16u8)__msa_ilvl_b((v16i8)reg1, (v16i8)vec2);
// ST_UB4 is a project macro not visible here; presumably it stores the four
// vectors 16 bytes apart.
2372 ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16);
2373 src_sobelx += 16;
2374 src_sobely += 16;
2375 dst_argb += 64;
2376 }
2377 }
2378
2026 #ifdef __cplusplus 2379 #ifdef __cplusplus
2027 } // extern "C" 2380 } // extern "C"
2028 } // namespace libyuv 2381 } // namespace libyuv
2029 #endif 2382 #endif
2030 2383
2031 #endif // !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) 2384 #endif // !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
OLD | NEW
« no previous file with comments | « source/row_any.cc ('k') | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698