| OLD | NEW | 
| (Empty) |  | 
 |    1 /* | 
 |    2  * Copyright 2014 Google Inc. | 
 |    3  * | 
 |    4  * Use of this source code is governed by a BSD-style license that can be | 
 |    5  * found in the LICENSE file. | 
 |    6  */ | 
 |    7  | 
 |    8 #include "SkBenchmark.h" | 
 |    9 #include "SkRandom.h" | 
 |   10 #include "SkTemplates.h" | 
 |   11  | 
 |   12 template <typename Memcpy32> | 
 |   13 class Memcpy32Bench : public SkBenchmark { | 
 |   14 public: | 
 |   15     explicit Memcpy32Bench(int count, Memcpy32 memcpy32, const char* name) | 
 |   16         : fCount(count) | 
 |   17         , fMemcpy32(memcpy32) | 
 |   18         , fName(SkStringPrintf("%s_%d", name, count)) {} | 
 |   19  | 
 |   20     virtual const char* onGetName() SK_OVERRIDE { | 
 |   21         return fName.c_str(); | 
 |   22     } | 
 |   23  | 
 |   24     virtual bool isSuitableFor(Backend backend) SK_OVERRIDE { | 
 |   25         return backend == kNonRendering_Backend; | 
 |   26     } | 
 |   27  | 
 |   28     virtual void onPreDraw() SK_OVERRIDE { | 
 |   29         fDst.reset(fCount); | 
 |   30         fSrc.reset(fCount); | 
 |   31  | 
 |   32         SkRandom rand; | 
 |   33         for (int i = 0; i < fCount; i++) { | 
 |   34             fSrc[i] = rand.nextU(); | 
 |   35         } | 
 |   36     } | 
 |   37  | 
 |   38     virtual void onDraw(const int loops, SkCanvas*) SK_OVERRIDE { | 
 |   39         for (int i = 0; i < loops; i++) { | 
 |   40             fMemcpy32(fDst, fSrc, fCount); | 
 |   41         } | 
 |   42     } | 
 |   43  | 
 |   44 private: | 
 |   45     SkAutoTMalloc<uint32_t> fDst, fSrc; | 
 |   46  | 
 |   47     int fCount; | 
 |   48     Memcpy32 fMemcpy32; | 
 |   49     const SkString fName; | 
 |   50 }; | 
 |   51  | 
 |   52 template <typename Memcpy32> | 
 |   53 static Memcpy32Bench<Memcpy32>* Bench(int count, Memcpy32 memcpy32, const char* 
     name) { | 
 |   54     return new Memcpy32Bench<Memcpy32>(count, memcpy32, name); | 
 |   55 } | 
 |   56 #define BENCH(memcpy32, count) DEF_BENCH(return Bench(count, memcpy32, #memcpy32
     ); ) | 
 |   57  | 
 |   58  | 
 |   59 // Let the libc developers do what they think is best. | 
 |   60 static void memcpy32_memcpy(uint32_t* dst, const uint32_t* src, int count) { | 
 |   61     memcpy(dst, src, sizeof(uint32_t) * count); | 
 |   62 } | 
// Benchmark memcpy32_memcpy at copy sizes from 10 up to 100000 32-bit words.
BENCH(memcpy32_memcpy, 10)
BENCH(memcpy32_memcpy, 100)
BENCH(memcpy32_memcpy, 1000)
BENCH(memcpy32_memcpy, 10000)
BENCH(memcpy32_memcpy, 100000)
 |   68  | 
 |   69 // Let the compiler's autovectorizer do what it thinks is best. | 
 |   70 static void memcpy32_autovectorize(uint32_t* dst, const uint32_t* src, int count
     ) { | 
 |   71     while (count --> 0) { | 
 |   72         *dst++ = *src++; | 
 |   73     } | 
 |   74 } | 
// Benchmark memcpy32_autovectorize at copy sizes from 10 up to 100000 32-bit words.
BENCH(memcpy32_autovectorize, 10)
BENCH(memcpy32_autovectorize, 100)
BENCH(memcpy32_autovectorize, 1000)
BENCH(memcpy32_autovectorize, 10000)
BENCH(memcpy32_autovectorize, 100000)
 |   80  | 
 |   81 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2 | 
 |   82  | 
 |   83 // Align dst to 16 bytes, then use aligned stores.  src isn't algined, so use un
     aligned loads. | 
 |   84 static void memcpy32_sse2_align(uint32_t* dst, const uint32_t* src, int count) { | 
 |   85     if (count >= 16) { | 
 |   86         while (uintptr_t(dst) & 0xF) { | 
 |   87             *dst++ = *src++; | 
 |   88             count--; | 
 |   89         } | 
 |   90  | 
 |   91         __m128i* dst128 = reinterpret_cast<__m128i*>(dst); | 
 |   92         const __m128i* src128 = reinterpret_cast<const __m128i*>(src); | 
 |   93         while (count >= 16) { | 
 |   94             __m128i a = _mm_loadu_si128(src128++); | 
 |   95             __m128i b = _mm_loadu_si128(src128++); | 
 |   96             __m128i c = _mm_loadu_si128(src128++); | 
 |   97             __m128i d = _mm_loadu_si128(src128++); | 
 |   98  | 
 |   99             _mm_store_si128(dst128++, a); | 
 |  100             _mm_store_si128(dst128++, b); | 
 |  101             _mm_store_si128(dst128++, c); | 
 |  102             _mm_store_si128(dst128++, d); | 
 |  103  | 
 |  104             count -= 16; | 
 |  105         } | 
 |  106  | 
 |  107         dst = reinterpret_cast<uint32_t*>(dst128); | 
 |  108         src = reinterpret_cast<const uint32_t*>(src128); | 
 |  109     } | 
 |  110  | 
 |  111     while (count --> 0) { | 
 |  112         *dst++ = *src++; | 
 |  113     } | 
 |  114 } | 
// Benchmark memcpy32_sse2_align at copy sizes from 10 up to 100000 32-bit words.
// The size-10 case is below that function's 16-word threshold, so it measures
// only its scalar loop.
BENCH(memcpy32_sse2_align, 10)
BENCH(memcpy32_sse2_align, 100)
BENCH(memcpy32_sse2_align, 1000)
BENCH(memcpy32_sse2_align, 10000)
BENCH(memcpy32_sse2_align, 100000)
 |  120  | 
 |  121 // Leave both dst and src unaliged, and so use unaligned stores for dst and unal
     igned loads for src. | 
 |  122 static void memcpy32_sse2_unalign(uint32_t* dst, const uint32_t* src, int count)
      { | 
 |  123     __m128i* dst128 = reinterpret_cast<__m128i*>(dst); | 
 |  124     const __m128i* src128 = reinterpret_cast<const __m128i*>(src); | 
 |  125     while (count >= 16) { | 
 |  126         __m128i a = _mm_loadu_si128(src128++); | 
 |  127         __m128i b = _mm_loadu_si128(src128++); | 
 |  128         __m128i c = _mm_loadu_si128(src128++); | 
 |  129         __m128i d = _mm_loadu_si128(src128++); | 
 |  130  | 
 |  131         _mm_storeu_si128(dst128++, a); | 
 |  132         _mm_storeu_si128(dst128++, b); | 
 |  133         _mm_storeu_si128(dst128++, c); | 
 |  134         _mm_storeu_si128(dst128++, d); | 
 |  135  | 
 |  136         count -= 16; | 
 |  137     } | 
 |  138  | 
 |  139     dst = reinterpret_cast<uint32_t*>(dst128); | 
 |  140     src = reinterpret_cast<const uint32_t*>(src128); | 
 |  141     while (count --> 0) { | 
 |  142         *dst++ = *src++; | 
 |  143     } | 
 |  144 } | 
// Benchmark memcpy32_sse2_unalign at copy sizes from 10 up to 100000 32-bit words.
// The size-10 case never enters that function's 16-word SSE2 loop, so it measures
// only its scalar tail loop.
BENCH(memcpy32_sse2_unalign, 10)
BENCH(memcpy32_sse2_unalign, 100)
BENCH(memcpy32_sse2_unalign, 1000)
BENCH(memcpy32_sse2_unalign, 10000)
BENCH(memcpy32_sse2_unalign, 100000)
 |  150  | 
 |  151 #endif // SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2 | 
 |  152  | 
 |  153 #undef BENCH | 
 |  154  | 
| OLD | NEW |