Commit 4bf1ce27 authored by stephana's avatar stephana Committed by Commit bot
Browse files

Revert of Revert of SSE4 opaque blend using intrinsics instead of assembly....

Revert of Revert of SSE4 opaque blend using intrinsics instead of assembly. (patchset #1 id:1 of https://codereview.chromium.org/873553003/)

Reason for revert:
Reverted the wrong CL.

Original issue's description:
> Revert of SSE4 opaque blend using intrinsics instead of assembly. (patchset #16 id:300001 of https://codereview.chromium.org/874863002/)
>
> Reason for revert:
> This causes a bug on the 'hittestpath' GM on MacMini 4,1
>
> See:
>
> https://gold.skia.org/#/triage/hittestpath?head=0
>
> for details.
>
> Original issue's description:
> > SSE4 opaque blend using intrinsics instead of assembly.
> >
> > Since we had such a hard time with the assembly versions of this blit (to the
> > point that we have them completely disabled everywhere), I thought I'd take
> > a shot at writing a version of the blit using intrinsics.
> >
> > The key feature of SSE4 we're exploiting is that we can use ptest (_mm_test*)
> > to skip the blend when the 16 src pixels we consider each loop are all opaque
> > or all transparent.  _mm_shuffle_epi8 from SSSE3 also lends a hand to extract
> > all those alphas.
> >
> > It's worth looking to see if we can backport this type of logic to SSE2 using
> > _mm_movemask_epi8, or up to 32 pixels at a time using AVX.
> >
> > My local performance testing doesn't show this to be an unambiguous win
> > (there are probably microbenchmarks and SKPs where we'd be better off just
> > powering through the blend rather than looking at alphas), but the potential
> > does seem tantalizing enough to let skiaperf vet it on the bots.  (< 1.0x is a win.)
> >
> > DM says it draws pixel perfect compare to the old code.
> >
> > Microbenchmarks:
> >                bitmap_RGBA_8888_A_source_stripes_two	  14us -> 14.4us	1.03x
> >              bitmap_RGBA_8888_A_source_stripes_three	14.3us -> 14.5us	1.01x
> >                        bitmap_RGBA_8888_scale_bilerp	61.9us -> 62.2us	1.01x
> > bitmap_RGBA_8888_update_volatile_scale_rotate_bilerp	 102us ->  101us	0.99x
> >                 bitmap_RGBA_8888_scale_rotate_bilerp	 103us ->  101us	0.99x
> >                               bitmap_RGBA_8888_scale	18.4us -> 18.2us	0.99x
> >              bitmap_RGBA_8888_A_scale_rotate_bicubic	  71us ->   70us	0.99x
> >          bitmap_RGBA_8888_update_scale_rotate_bilerp	 103us ->  101us	0.99x
> >               bitmap_RGBA_8888_A_scale_rotate_bilerp	 112us ->  109us	0.98x
> >                     bitmap_RGBA_8888_update_volatile	5.72us -> 5.58us	0.98x
> >                                     bitmap_RGBA_8888	5.73us -> 5.58us	0.97x
> >                              bitmap_RGBA_8888_update	5.78us ->  5.6us	0.97x
> >                      bitmap_RGBA_8888_A_scale_bilerp	70.7us ->   68us	0.96x
> >                     bitmap_RGBA_8888_A_scale_bicubic	23.7us -> 21.8us	0.92x
> >                                   bitmap_RGBA_8888_A	13.9us -> 10.9us	0.78x
> >                     bitmap_RGBA_8888_A_source_opaque	  14us -> 6.29us	0.45x
> >                bitmap_RGBA_8888_A_source_transparent	  14us -> 3.65us	0.26x
> >
> > Running over our ~70 SKP web page captures, this looks like we spend 0.7x
> > the time in S32A_Opaque_BlitRow compared to the SSE2 version, which should
> > be a decent predictor of real-world impact.
> >
> > BUG=chromium:399842
> >
> > Committed: https://skia.googlesource.com/skia/+/04bc91b972417038fecfa87c484771eac2b9b785
> >
> > CQ_EXTRA_TRYBOTS=client.skia:Test-Mac10.6-MacMini4.1-GeForce320M-x86_64-Release-Trybot
> >
> > Committed: https://skia.googlesource.com/skia/+/6dbfb21a6c88af6d94e8c823c3ad559f1a41b493
>
> TBR=henrik.smiding@intel.com,mtklein@google.com,herb@google.com,reed@google.com,thakis@chromium.org,mtklein@chromium.org
> NOPRESUBMIT=true
> NOTREECHECKS=true
> NOTRY=true
> BUG=chromium:399842
>
> Committed: https://skia.googlesource.com/skia/+/4988891a1173cd405bf1c1dd3a3668c451f45e4c

TBR=henrik.smiding@intel.com,mtklein@google.com,herb@google.com,reed@google.com,thakis@chromium.org,mtklein@chromium.org
NOPRESUBMIT=true
NOTREECHECKS=true
NOTRY=true
BUG=chromium:399842

Review URL: https://codereview.chromium.org/894083002
parent 4988891a
......@@ -79,5 +79,6 @@
],
'sse41_sources': [
'<(skia_src_path)/opts/SkBlurImage_opts_SSE4.cpp',
'<(skia_src_path)/opts/SkBlitRow_opts_SSE4.cpp',
],
}
#include "SkBlitRow_opts_SSE4.h"
// Some compilers can't compile SSSE3 or SSE4 intrinsics. We give them stub methods.
// The stubs should never be called, so we make them crash just to confirm that.
#if SK_CPU_SSE_LEVEL < SK_CPU_SSE_LEVEL_SSE41
void S32A_Opaque_BlitRow32_SSE4(SkPMColor* SK_RESTRICT, const SkPMColor* SK_RESTRICT, int, U8CPU) {
sk_throw();
}
#else
#include <emmintrin.h> // SSE2: Most _mm_foo() in this file.
#include <smmintrin.h> // SSE4.1: _mm_testz_si128 and _mm_testc_si128.
#include "SkColorPriv.h"
#include "SkColor_opts_SSE2.h"
void S32A_Opaque_BlitRow32_SSE4(SkPMColor* SK_RESTRICT dst,
const SkPMColor* SK_RESTRICT src,
int count,
U8CPU alpha) {
SkASSERT(alpha == 255);
// As long as we can, we'll work on 16 pixel pairs at once.
int count16 = count / 16;
__m128i* dst4 = (__m128i*)dst;
const __m128i* src4 = (const __m128i*)src;
for (int i = 0; i < count16 * 4; i += 4) {
// Load 16 source pixels.
__m128i s0 = _mm_loadu_si128(src4+i+0),
s1 = _mm_loadu_si128(src4+i+1),
s2 = _mm_loadu_si128(src4+i+2),
s3 = _mm_loadu_si128(src4+i+3);
const __m128i alphaMask = _mm_set1_epi32(0xFF << SK_A32_SHIFT);
const __m128i ORed = _mm_or_si128(s3, _mm_or_si128(s2, _mm_or_si128(s1, s0)));
if (_mm_testz_si128(ORed, alphaMask)) {
// All 16 source pixels are fully transparent. There's nothing to do!
continue;
}
const __m128i ANDed = _mm_and_si128(s3, _mm_and_si128(s2, _mm_and_si128(s1, s0)));
if (_mm_testc_si128(ANDed, alphaMask)) {
// All 16 source pixels are fully opaque. There's no need to read dst or blend it.
_mm_storeu_si128(dst4+i+0, s0);
_mm_storeu_si128(dst4+i+1, s1);
_mm_storeu_si128(dst4+i+2, s2);
_mm_storeu_si128(dst4+i+3, s3);
continue;
}
// The general slow case: do the blend for all 16 pixels.
_mm_storeu_si128(dst4+i+0, SkPMSrcOver_SSE2(s0, _mm_loadu_si128(dst4+i+0)));
_mm_storeu_si128(dst4+i+1, SkPMSrcOver_SSE2(s1, _mm_loadu_si128(dst4+i+1)));
_mm_storeu_si128(dst4+i+2, SkPMSrcOver_SSE2(s2, _mm_loadu_si128(dst4+i+2)));
_mm_storeu_si128(dst4+i+3, SkPMSrcOver_SSE2(s3, _mm_loadu_si128(dst4+i+3)));
}
// Wrap up the last <= 15 pixels.
for (int i = count16*16; i < count; i++) {
// This check is not really necessarily, but it prevents pointless autovectorization.
if (src[i] & 0xFF000000) {
dst[i] = SkPMSrcOver(src[i], dst[i]);
}
}
}
#endif
......@@ -10,24 +10,9 @@
#include "SkBlitRow.h"
#ifdef CRBUG_399842_FIXED
/* Check if we are able to build assembly code, GCC/AT&T syntax:
* 1) Clang and GCC are generally OK. OS X's old LLVM-GCC 4.2 can't handle it;
* 2) We're intentionally not linking this in even when supported (Clang) on Windows;
* 3) MemorySanitizer cannot instrument assembly at all.
*/
#if /* 1)*/ (defined(__clang__) || (defined(__GNUC__) && !defined(SK_BUILD_FOR_MAC))) \
/* 2)*/ && !defined(SK_BUILD_FOR_WIN) \
/* 3)*/ && !defined(MEMORY_SANITIZER)
extern "C" void S32A_Opaque_BlitRow32_SSE4_asm(SkPMColor* SK_RESTRICT dst,
const SkPMColor* SK_RESTRICT src,
int count, U8CPU alpha);
#define SK_ATT_ASM_SUPPORTED
#endif
#endif // CRBUG_399842_FIXED
void S32A_Opaque_BlitRow32_SSE4(SkPMColor* SK_RESTRICT,
const SkPMColor* SK_RESTRICT,
int count,
U8CPU alpha);
#endif
This diff is collapsed.
This diff is collapsed.
......@@ -206,7 +206,14 @@ static inline __m128i SkPixel32ToPixel16_ToU16_SSE2(const __m128i& src_pixel1,
return d_pixel;
}
// Portable version SkBlendARGB32 is in SkColorPriv.h.
// Portable version is SkPMSrcOver in SkColorPriv.h.
static inline __m128i SkPMSrcOver_SSE2(const __m128i& src, const __m128i& dst) {
return _mm_add_epi32(src,
SkAlphaMulQ_SSE2(dst, _mm_sub_epi32(_mm_set1_epi32(256),
SkGetPackedA32_SSE2(src))));
}
// Portable version is SkBlendARGB32 in SkColorPriv.h.
static inline __m128i SkBlendARGB32_SSE2(const __m128i& src, const __m128i& dst,
const __m128i& aa) {
__m128i src_scale = SkAlpha255To256_SSE2(aa);
......
......@@ -227,21 +227,17 @@ static SkBlitRow::Proc32 platform_32_procs_SSE2[] = {
S32A_Blend_BlitRow32_SSE2, // S32A_Blend,
};
#if defined(SK_ATT_ASM_SUPPORTED)
static SkBlitRow::Proc32 platform_32_procs_SSE4[] = {
NULL, // S32_Opaque,
S32_Blend_BlitRow32_SSE2, // S32_Blend,
S32A_Opaque_BlitRow32_SSE4_asm, // S32A_Opaque
S32A_Opaque_BlitRow32_SSE4, // S32A_Opaque
S32A_Blend_BlitRow32_SSE2, // S32A_Blend,
};
#endif
SkBlitRow::Proc32 SkBlitRow::PlatformProcs32(unsigned flags) {
#if defined(SK_ATT_ASM_SUPPORTED)
if (supports_simd(SK_CPU_SSE_LEVEL_SSE41)) {
return platform_32_procs_SSE4[flags];
} else
#endif
if (supports_simd(SK_CPU_SSE_LEVEL_SSE2)) {
return platform_32_procs_SSE2[flags];
} else {
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment