From 00ddb0029da729c4cb456d5fe39f4a65c8531ca2 Mon Sep 17 00:00:00 2001 From: Adlai Holler Date: Mon, 11 May 2020 19:25:52 -0400 Subject: [PATCH] Reland "Support large kernels on GPU in matrix convolution effect" This reverts commit a117e7b75bd6ae262d52ee61769afd7cb1ff556f. Reason for revert: Fixed divide-by-0 in the unpremul logic. The divide-by-0 was already present before this change but never caused visible problems (or we ignored them). Original change's description: > Revert "Reland "Support large kernels on GPU in matrix convolution effect"" > > This reverts commit 76cb9c4d4c8b5db4a3c002a4174172e4972a05b6. > > Reason for revert: Tegra3 & Metal issues > > Original change's description: > > Reland "Support large kernels on GPU in matrix convolution effect" > > > > This reverts commit 41e377d1baf0dbf1355ec644a6fe40f6da48fc43. > > > > Reason for revert: fixed issues > > > > Bug: skia:8449 > > Change-Id: I0c4389f0efa92c6da69253b2304ad9a072750965 > > Reviewed-on: https://skia-review.googlesource.com/c/skia/+/287817 > > Commit-Queue: Adlai Holler > > Reviewed-by: Brian Salomon > > TBR=bsalomon@google.com,robertphillips@google.com,michaelludwig@google.com,adlai@google.com > > Change-Id: I5c3f04d4d262550a3298b8fd677c8a1661be7ad9 > No-Presubmit: true > No-Tree-Checks: true > No-Try: true > Bug: skia:8449 > Reviewed-on: https://skia-review.googlesource.com/c/skia/+/289076 > Reviewed-by: Adlai Holler > Commit-Queue: Adlai Holler TBR=bsalomon@google.com,robertphillips@google.com,michaelludwig@google.com,adlai@google.com Bug: skia:8449 Change-Id: I90b8e9e0eb52bc08308fb472eb216ed0bd4785a1 Reviewed-on: https://skia-review.googlesource.com/c/skia/+/289030 Reviewed-by: Brian Salomon Commit-Queue: Adlai Holler --- gm/matrixconvolution.cpp | 3 +- src/core/SkGpuBlurUtils.cpp | 8 +- .../SkMatrixConvolutionImageFilter.cpp | 7 +- src/gpu/GrFragmentProcessor.h | 2 + src/gpu/effects/GrMatrixConvolutionEffect.cpp | 277 ++++++++++++++---- src/gpu/effects/GrMatrixConvolutionEffect.h | 103 ++++++- 6 files changed, 324 
insertions(+), 76 deletions(-) diff --git a/gm/matrixconvolution.cpp b/gm/matrixconvolution.cpp index 5897ffb65d..94fc8b311f 100644 --- a/gm/matrixconvolution.cpp +++ b/gm/matrixconvolution.cpp @@ -22,6 +22,7 @@ #include "include/core/SkTypeface.h" #include "include/effects/SkGradientShader.h" #include "include/effects/SkImageFilters.h" +#include "src/gpu/effects/GrMatrixConvolutionEffect.h" #include "tools/ToolUtils.h" #include @@ -79,7 +80,7 @@ protected: return SkImageFilters::MatrixConvolution({3,3}, kernel.data(), /* gain */ 0.3f, /* bias */ SkIntToScalar(100), kernelOffset, tileMode, convolveAlpha, nullptr, cropRect); } case kLarge_KernelFixture: { - // Intentionally go over the MAX_KERNEL_SIZE limit and trigger CPU fallback. + static_assert(49 > GrMatrixConvolutionEffect::kMaxUniformSize); // All 1s except center value, which is -47 (sum of 1). std::vector kernel(49, SkIntToScalar(1)); kernel[24] = SkIntToScalar(-47); diff --git a/src/core/SkGpuBlurUtils.cpp b/src/core/SkGpuBlurUtils.cpp index 19abd30fb9..e578ad6d7b 100644 --- a/src/core/SkGpuBlurUtils.cpp +++ b/src/core/SkGpuBlurUtils.cpp @@ -128,8 +128,9 @@ static std::unique_ptr convolve_gaussian_2d(GrRecordingCo SkIPoint kernelOffset = SkIPoint::Make(radiusX, radiusY); GrPaint paint; auto wm = SkTileModeToWrapMode(mode); - auto conv = GrMatrixConvolutionEffect::MakeGaussian(std::move(srcView), srcBounds, size, 1.0, - 0.0, kernelOffset, wm, true, sigmaX, sigmaY, + auto conv = GrMatrixConvolutionEffect::MakeGaussian(context, std::move(srcView), srcBounds, + size, 1.0, 0.0, kernelOffset, wm, true, + sigmaX, sigmaY, *renderTargetContext->caps()); paint.addColorFragmentProcessor(std::move(conv)); paint.setPorterDuffXPFactory(SkBlendMode::kSrc); @@ -453,7 +454,8 @@ std::unique_ptr GaussianBlur(GrRecordingContext* context, if (scaleFactorX == 1 && scaleFactorY == 1) { // For really small blurs (certainly no wider than 5x5 on desktop GPUs) it is faster to just // launch a single non separable kernel vs two 
launches. - if (sigmaX > 0 && sigmaY > 0 && (2 * radiusX + 1) * (2 * radiusY + 1) <= MAX_KERNEL_SIZE) { + const int kernelSize = (2 * radiusX + 1) * (2 * radiusY + 1); + if (sigmaX > 0 && sigmaY > 0 && kernelSize <= GrMatrixConvolutionEffect::kMaxUniformSize) { // Apply the proxy offset to src bounds and offset directly return convolve_gaussian_2d(context, std::move(srcView), srcColorType, srcBounds, dstBounds, radiusX, radiusY, sigmaX, sigmaY, mode, diff --git a/src/effects/imagefilters/SkMatrixConvolutionImageFilter.cpp b/src/effects/imagefilters/SkMatrixConvolutionImageFilter.cpp index 8b242c8923..0bac7ebcc6 100644 --- a/src/effects/imagefilters/SkMatrixConvolutionImageFilter.cpp +++ b/src/effects/imagefilters/SkMatrixConvolutionImageFilter.cpp @@ -391,9 +391,7 @@ sk_sp SkMatrixConvolutionImageFilterImpl::onFilterImage(const Co } #if SK_SUPPORT_GPU - // Note: if the kernel is too big, the GPU path falls back to SW - if (ctx.gpuBacked() && - fKernelSize.width() * fKernelSize.height() <= MAX_KERNEL_SIZE) { + if (ctx.gpuBacked()) { auto context = ctx.getContext(); // Ensure the input is in the destination color space. 
Typically applyCropRect will have @@ -414,7 +412,8 @@ sk_sp SkMatrixConvolutionImageFilterImpl::onFilterImage(const Co // Map srcBounds from input's logical image domain to that of the proxy srcBounds.offset(input->subset().x(), input->subset().y()); - auto fp = GrMatrixConvolutionEffect::Make(std::move(inputView), + auto fp = GrMatrixConvolutionEffect::Make(context, + std::move(inputView), srcBounds, fKernelSize, fKernel, diff --git a/src/gpu/GrFragmentProcessor.h b/src/gpu/GrFragmentProcessor.h index 194bb296e1..efc4c9ac78 100644 --- a/src/gpu/GrFragmentProcessor.h +++ b/src/gpu/GrFragmentProcessor.h @@ -490,6 +490,8 @@ public: TextureSampler(GrSurfaceProxyView, GrSamplerState = {}); + TextureSampler(TextureSampler&&) = default; + TextureSampler& operator=(TextureSampler&&) = default; TextureSampler& operator=(const TextureSampler&) = delete; bool operator==(const TextureSampler& that) const { diff --git a/src/gpu/effects/GrMatrixConvolutionEffect.cpp b/src/gpu/effects/GrMatrixConvolutionEffect.cpp index 8bb5a564f3..3736996046 100644 --- a/src/gpu/effects/GrMatrixConvolutionEffect.cpp +++ b/src/gpu/effects/GrMatrixConvolutionEffect.cpp @@ -6,6 +6,11 @@ */ #include "src/gpu/effects/GrMatrixConvolutionEffect.h" +#include "include/private/SkHalf.h" +#include "src/gpu/GrBitmapTextureMaker.h" +#include "src/gpu/GrContextPriv.h" +#include "src/gpu/GrProxyProvider.h" +#include "src/gpu/GrRecordingContextPriv.h" #include "src/gpu/GrTexture.h" #include "src/gpu/GrTextureProxy.h" #include "src/gpu/effects/GrTextureEffect.h" @@ -26,34 +31,195 @@ protected: private: typedef GrGLSLProgramDataManager::UniformHandle UniformHandle; + void emitKernelBlock(EmitArgs&, SkIPoint); + UniformHandle fKernelUni; UniformHandle fKernelOffsetUni; UniformHandle fGainUni; UniformHandle fBiasUni; + UniformHandle fKernelBiasUni; typedef GrGLSLFragmentProcessor INHERITED; }; +GrMatrixConvolutionEffect::KernelWrapper GrMatrixConvolutionEffect::KernelWrapper::Make( + GrRecordingContext* context, 
SkISize size, const GrCaps& caps, const SkScalar* values) { + if (nullptr == context || nullptr == values || size.isEmpty()) { + return {}; + } + const int length = size.area(); + // Small kernel -> just fill the array. + KernelWrapper result(size); + if (length <= kMaxUniformSize) { + for (int i = 0; i < length; i++) { + result.fArray[i] = SkScalarToFloat(values[i]); + } + return result; + } + + ScalableSampler& scalableSampler = result.fScalableSampler; + bool useA16 = + context->defaultBackendFormat(kA16_float_SkColorType, GrRenderable::kNo).isValid(); + SkScalar min = values[0]; + if (!useA16) { + // Determine min and max values to figure out inner gain & bias. + SkScalar max = values[0]; + for (int i = 1; i < length; i++) { + if (values[i] < min) { + min = values[i]; + } + if (values[i] > max) { + max = values[i]; + } + } + // Treat near-0 gain (i.e. box blur) as 1, and let the kernelBias + // move everything up to the final value. + const SkScalar computedGain = max - min; + scalableSampler.fGain = + SkScalarNearlyZero(computedGain) ? 1.0f : SkScalarToFloat(computedGain); + // Inner bias is pre-inner-gain so we divide that out. + scalableSampler.fBias = SkScalarToFloat(min) / scalableSampler.fGain; + } + + // TODO: Enable kernel caching and check perf. + static constexpr bool kCacheKernelTexture = false; + + GrUniqueKey key; + if (kCacheKernelTexture) { + static const GrUniqueKey::Domain kDomain = GrUniqueKey::GenerateDomain(); + GrUniqueKey::Builder builder(&key, kDomain, length, "Matrix Convolution Kernel"); + // Texture cache key is the exact content of the kernel. + static_assert(sizeof(float) == 4); + for (int i = 0; i < length; i++) { + builder[i] = *(const uint32_t*)&values[i]; + } + builder.finish(); + } + + // Find or create a texture. + GrProxyProvider* proxyProvider = context->priv().proxyProvider(); + GrSurfaceProxyView view; + SkColorType colorType = useA16 ? 
kA16_float_SkColorType : kAlpha_8_SkColorType; + sk_sp cachedKernel; + if (kCacheKernelTexture && (cachedKernel = proxyProvider->findOrCreateProxyByUniqueKey(key))) { + GrSwizzle swizzle = + context->priv().caps()->getReadSwizzle(cachedKernel->backendFormat(), + SkColorTypeToGrColorType(colorType)); + view = {std::move(cachedKernel), kTopLeft_GrSurfaceOrigin, swizzle}; + } else { + SkBitmap bm; + auto info = SkImageInfo::Make({(int)GrNextPow2(length), 1}, colorType, + kPremul_SkAlphaType, nullptr); + if (!bm.tryAllocPixels(info)) { + return {}; + } + for (int i = 0; i < length; i++) { + if (useA16) { + *bm.getAddr16(i, 0) = SkFloatToHalf(values[i]); + } else { + *bm.getAddr8(i, 0) = + SkScalarRoundToInt((values[i] - min) / scalableSampler.fGain * 255); + } + } + bm.setImmutable(); + GrBitmapTextureMaker maker(context, bm, GrImageTexGenPolicy::kNew_Uncached_Budgeted); + view = maker.view(GrMipMapped::kNo); + if (!view) { + return {}; + } + if (kCacheKernelTexture) { + proxyProvider->assignUniqueKeyToProxy(key, view.asTextureProxy()); + } + } + scalableSampler.fSampler = { std::move(view) }; + return result; +} + +bool GrMatrixConvolutionEffect::KernelWrapper::operator==(const KernelWrapper& k) const { + if (fSize != k.fSize) { + return false; + } else if (this->isSampled()) { + return fScalableSampler == k.fScalableSampler; + } else { + return std::equal(fArray.begin(), fArray.begin() + fSize.area(), k.fArray.begin()); + } +} + +bool GrMatrixConvolutionEffect::KernelWrapper::ScalableSampler::operator==( + const ScalableSampler& k) const { + return fSampler == k.fSampler && fGain == k.fGain && fBias == k.fBias; +} + +// For sampled kernels, emit a for loop that does all the kernel accumulation. +// For uniform kernels, emit a single iteration. Function is called repeatedly in a for loop. +// loc is ignored for sampled kernels. 
+void GrGLMatrixConvolutionEffect::emitKernelBlock(EmitArgs& args, SkIPoint loc) { + const GrMatrixConvolutionEffect& mce = args.fFp.cast(); + GrGLSLFPFragmentBuilder* fragBuilder = args.fFragBuilder; + GrGLSLUniformHandler* uniformHandler = args.fUniformHandler; + int kernelWidth = mce.kernelSize().width(); + int kernelHeight = mce.kernelSize().height(); + int kernelArea = kernelWidth * kernelHeight; + + if (mce.kernelIsSampled()) { + fragBuilder->codeAppendf("half2 kernelCoord = half2(0, 0);"); + fragBuilder->codeAppendf("for (int i = 0; i < %d; ++i)", (int)kernelArea); + } + + GrGLSLShaderBuilder::ShaderBlock block(fragBuilder); + + fragBuilder->codeAppend("half k;"); + fragBuilder->codeAppend("half2 sourceOffset;"); + if (mce.kernelIsSampled()) { + const char* kernelBias = uniformHandler->getUniformCStr(fKernelBiasUni); + fragBuilder->codeAppend("k = "); + fragBuilder->appendTextureLookup(args.fTexSamplers[0], "kernelCoord"); + fragBuilder->codeAppendf(".w + %s;", kernelBias); + fragBuilder->codeAppendf("sourceOffset.y = floor(i / %d);", kernelWidth); + fragBuilder->codeAppendf("sourceOffset.x = i - sourceOffset.y * %d;", kernelWidth); + float kernelStride = 1.0f / (float)GrNextPow2(kernelArea); + fragBuilder->codeAppendf("kernelCoord.x += %f;", kernelStride); + } else { + fragBuilder->codeAppendf("sourceOffset = half2(%d, %d);", loc.x(), loc.y()); + int offset = loc.y() * kernelWidth + loc.x(); + static constexpr const char kVecSuffix[][4] = { ".x", ".y", ".z", ".w" }; + const char* kernel = uniformHandler->getUniformCStr(fKernelUni); + fragBuilder->codeAppendf("k = %s[%d]%s;", kernel, offset / 4, + kVecSuffix[offset & 0x3]); + } + + auto sample = this->invokeChild(0, args, "coord + sourceOffset"); + fragBuilder->codeAppendf("half4 c = %s;", sample.c_str()); + if (!mce.convolveAlpha()) { + fragBuilder->codeAppend("c.rgb /= max(c.a, 0.0001);"); + fragBuilder->codeAppend("c.rgb = saturate(c.rgb);"); + } + fragBuilder->codeAppend("sum += c * k;"); +} + void 
GrGLMatrixConvolutionEffect::emitCode(EmitArgs& args) { const GrMatrixConvolutionEffect& mce = args.fFp.cast(); - int kWidth = mce.kernelSize().width(); - int kHeight = mce.kernelSize().height(); + int kernelWidth = mce.kernelSize().width(); + int kernelHeight = mce.kernelSize().height(); - int arrayCount = (kWidth * kHeight + 3) / 4; - SkASSERT(4 * arrayCount >= kWidth * kHeight); + int arrayCount = (kernelWidth * kernelHeight + 3) / 4; + SkASSERT(4 * arrayCount >= kernelWidth * kernelHeight); GrGLSLUniformHandler* uniformHandler = args.fUniformHandler; - fKernelUni = uniformHandler->addUniformArray(&mce, kFragment_GrShaderFlag, kHalf4_GrSLType, - "Kernel", - arrayCount); + if (mce.kernelIsSampled()) { + fKernelBiasUni = uniformHandler->addUniform(&mce, kFragment_GrShaderFlag, + kHalf_GrSLType, "KernelBias"); + } else { + fKernelUni = uniformHandler->addUniformArray(&mce, kFragment_GrShaderFlag, + kHalf4_GrSLType, "Kernel", arrayCount); + } fKernelOffsetUni = uniformHandler->addUniform(&mce, kFragment_GrShaderFlag, kHalf2_GrSLType, "KernelOffset"); fGainUni = uniformHandler->addUniform(&mce, kFragment_GrShaderFlag, kHalf_GrSLType, "Gain"); fBiasUni = uniformHandler->addUniform(&mce, kFragment_GrShaderFlag, kHalf_GrSLType, "Bias"); const char* kernelOffset = uniformHandler->getUniformCStr(fKernelOffsetUni); - const char* kernel = uniformHandler->getUniformCStr(fKernelUni); const char* gain = uniformHandler->getUniformCStr(fGainUni); const char* bias = uniformHandler->getUniformCStr(fBiasUni); @@ -62,27 +228,17 @@ void GrGLMatrixConvolutionEffect::emitCode(EmitArgs& args) { mce.sampleMatrix()); fragBuilder->codeAppend("half4 sum = half4(0, 0, 0, 0);"); fragBuilder->codeAppendf("float2 coord = %s - %s;", coords2D.c_str(), kernelOffset); - fragBuilder->codeAppend("half4 c;"); - const char* kVecSuffix[4] = { ".x", ".y", ".z", ".w" }; - for (int y = 0; y < kHeight; y++) { - for (int x = 0; x < kWidth; x++) { - GrGLSLShaderBuilder::ShaderBlock block(fragBuilder); - int 
offset = y*kWidth + x; - - fragBuilder->codeAppendf("half k = %s[%d]%s;", kernel, offset / 4, - kVecSuffix[offset & 0x3]); - SkSL::String coord; - coord.appendf("coord + half2(%d, %d)", x, y); - auto sample = this->invokeChild(0, args, coord); - fragBuilder->codeAppendf("half4 c = %s;", sample.c_str()); - if (!mce.convolveAlpha()) { - fragBuilder->codeAppend("c.rgb /= c.a;"); - fragBuilder->codeAppend("c.rgb = saturate(c.rgb);"); + if (mce.kernelIsSampled()) { + this->emitKernelBlock(args, {}); + } else { + for (int x = 0; x < kernelWidth; ++x) { + for (int y = 0; y < kernelHeight; ++y) { + this->emitKernelBlock(args, SkIPoint::Make(x, y)); } - fragBuilder->codeAppend("sum += c * k;"); } } + if (mce.convolveAlpha()) { fragBuilder->codeAppendf("%s = sum * %s + %s;", args.fOutputColor, gain, bias); fragBuilder->codeAppendf("%s.a = saturate(%s.a);", args.fOutputColor, args.fOutputColor); @@ -90,7 +246,7 @@ void GrGLMatrixConvolutionEffect::emitCode(EmitArgs& args) { args.fOutputColor, args.fOutputColor, args.fOutputColor); } else { auto sample = this->invokeChild(0, args, coords2D.c_str()); - fragBuilder->codeAppendf("c = %s;", sample.c_str()); + fragBuilder->codeAppendf("half4 c = %s;", sample.c_str()); fragBuilder->codeAppendf("%s.a = c.a;", args.fOutputColor); fragBuilder->codeAppendf("%s.rgb = saturate(sum.rgb * %s + %s);", args.fOutputColor, gain, bias); fragBuilder->codeAppendf("%s.rgb *= %s.a;", args.fOutputColor, args.fOutputColor); @@ -111,17 +267,22 @@ void GrGLMatrixConvolutionEffect::onSetData(const GrGLSLProgramDataManager& pdma const GrFragmentProcessor& processor) { const GrMatrixConvolutionEffect& conv = processor.cast(); pdman.set2fv(fKernelOffsetUni, 1, conv.kernelOffset().ptr()); - int kernelCount = conv.kernelSize().width() * conv.kernelSize().height(); - int arrayCount = (kernelCount + 3) / 4; - SkASSERT(4 * arrayCount >= kernelCount); - pdman.set4fv(fKernelUni, arrayCount, conv.kernel()); - pdman.set1f(fGainUni, conv.gain()); + float totalGain = 
conv.gain(); + if (conv.kernelIsSampled()) { + totalGain *= conv.kernelSampleGain(); + pdman.set1f(fKernelBiasUni, conv.kernelSampleBias()); + } else { + int kernelCount = conv.kernelSize().area(); + int arrayCount = (kernelCount + 3) / 4; + SkASSERT(4 * arrayCount >= kernelCount); + pdman.set4fv(fKernelUni, arrayCount, conv.kernel()); + } pdman.set1f(fBiasUni, conv.bias()); + pdman.set1f(fGainUni, totalGain); } GrMatrixConvolutionEffect::GrMatrixConvolutionEffect(std::unique_ptr child, - const SkISize& kernelSize, - const SkScalar* kernel, + KernelWrapper kernel, SkScalar gain, SkScalar bias, const SkIPoint& kernelOffset, @@ -129,14 +290,14 @@ GrMatrixConvolutionEffect::GrMatrixConvolutionEffect(std::unique_ptrsetSampledWithExplicitCoords(); this->registerChildProcessor(std::move(child)); - for (int i = 0; i < kernelSize.width() * kernelSize.height(); i++) { - fKernel[i] = SkScalarToFloat(kernel[i]); + if (fKernel.isSampled()) { + this->setTextureSamplerCnt(1); } fKernelOffset = {static_cast(kernelOffset.x()), static_cast(kernelOffset.y())}; @@ -145,7 +306,7 @@ GrMatrixConvolutionEffect::GrMatrixConvolutionEffect(std::unique_ptrsetSampledWithExplicitCoords(); this->registerChildProcessor(std::move(child)); - std::copy_n(that.fKernel, fKernelSize.width() * fKernelSize.height(), fKernel); + if (fKernel.isSampled()) { + this->setTextureSamplerCnt(1); + } this->addCoordTransform(&fCoordTransform); } @@ -172,14 +335,18 @@ GrGLSLFragmentProcessor* GrMatrixConvolutionEffect::onCreateGLSLInstance() const bool GrMatrixConvolutionEffect::onIsEqual(const GrFragmentProcessor& sBase) const { const GrMatrixConvolutionEffect& s = sBase.cast(); - return fKernelSize == s.kernelSize() && - std::equal(fKernel, fKernel + fKernelSize.area(), s.fKernel) && + return fKernel == s.fKernel && fGain == s.gain() && fBias == s.bias() && fKernelOffset == s.kernelOffset() && fConvolveAlpha == s.convolveAlpha(); } +const GrFragmentProcessor::TextureSampler& 
GrMatrixConvolutionEffect::onTextureSampler( + int index) const { + return IthTextureSampler(index, fKernel.scalableSampler().fSampler); +} + static void fill_in_1D_gaussian_kernel_with_stride(float* kernel, int size, int stride, float twoSigmaSqrd) { SkASSERT(!SkScalarNearlyZero(twoSigmaSqrd, SK_ScalarNearlyZero)); @@ -204,7 +371,6 @@ static void fill_in_1D_gaussian_kernel_with_stride(float* kernel, int size, int static void fill_in_2D_gaussian_kernel(float* kernel, int width, int height, SkScalar sigmaX, SkScalar sigmaY) { - SkASSERT(width * height <= MAX_KERNEL_SIZE); const float twoSigmaSqrdX = 2.0f * SkScalarToFloat(SkScalarSquare(sigmaX)); const float twoSigmaSqrdY = 2.0f * SkScalarToFloat(SkScalarSquare(sigmaY)); @@ -260,7 +426,8 @@ static void fill_in_2D_gaussian_kernel(float* kernel, int width, int height, } } -std::unique_ptr GrMatrixConvolutionEffect::Make(GrSurfaceProxyView srcView, +std::unique_ptr GrMatrixConvolutionEffect::Make(GrRecordingContext* context, + GrSurfaceProxyView srcView, const SkIRect& srcBounds, const SkISize& kernelSize, const SkScalar* kernel, @@ -270,14 +437,19 @@ std::unique_ptr GrMatrixConvolutionEffect::Make(GrSurfacePr GrSamplerState::WrapMode wm, bool convolveAlpha, const GrCaps& caps) { + auto kw = KernelWrapper::Make(context, kernelSize, caps, kernel); + if (!kw.isValid()) { + return nullptr; + } GrSamplerState sampler(wm, GrSamplerState::Filter::kNearest); auto child = GrTextureEffect::MakeSubset(std::move(srcView), kPremul_SkAlphaType, SkMatrix::I(), sampler, SkRect::Make(srcBounds), caps); return std::unique_ptr(new GrMatrixConvolutionEffect( - std::move(child), kernelSize, kernel, gain, bias, kernelOffset, convolveAlpha)); + std::move(child), std::move(kw), gain, bias, kernelOffset, convolveAlpha)); } std::unique_ptr GrMatrixConvolutionEffect::MakeGaussian( + GrRecordingContext* context, GrSurfaceProxyView srcView, const SkIRect& srcBounds, const SkISize& kernelSize, @@ -289,11 +461,11 @@ std::unique_ptr 
GrMatrixConvolutionEffect::MakeGaussian( SkScalar sigmaX, SkScalar sigmaY, const GrCaps& caps) { - float kernel[MAX_KERNEL_SIZE]; - - fill_in_2D_gaussian_kernel(kernel, kernelSize.width(), kernelSize.height(), sigmaX, sigmaY); - return Make(std::move(srcView), srcBounds, kernelSize, kernel, gain, bias, kernelOffset, wm, - convolveAlpha, caps); + SkAutoSTMalloc<32, float> kernel(kernelSize.area()); + fill_in_2D_gaussian_kernel(kernel.get(), kernelSize.width(), kernelSize.height(), + sigmaX, sigmaY); + return Make(context, std::move(srcView), srcBounds, kernelSize, kernel.get(), + gain, bias, kernelOffset, wm, convolveAlpha, caps); } GR_DEFINE_FRAGMENT_PROCESSOR_TEST(GrMatrixConvolutionEffect); @@ -302,8 +474,9 @@ GR_DEFINE_FRAGMENT_PROCESSOR_TEST(GrMatrixConvolutionEffect); std::unique_ptr GrMatrixConvolutionEffect::TestCreate(GrProcessorTestData* d) { auto [view, ct, at] = d->randomView(); - int width = d->fRandom->nextRangeU(1, MAX_KERNEL_SIZE); - int height = d->fRandom->nextRangeU(1, MAX_KERNEL_SIZE / width); + static constexpr size_t kMaxTestKernelSize = 2 * kMaxUniformSize; + int width = d->fRandom->nextRangeU(1, kMaxTestKernelSize); + int height = d->fRandom->nextRangeU(1, kMaxTestKernelSize / width); SkISize kernelSize = SkISize::Make(width, height); std::unique_ptr kernel(new SkScalar[width * height]); for (int i = 0; i < width * height; i++) { @@ -325,8 +498,8 @@ std::unique_ptr GrMatrixConvolutionEffect::TestCreate(GrPro auto wm = static_cast( d->fRandom->nextULessThan(GrSamplerState::kWrapModeCount)); bool convolveAlpha = d->fRandom->nextBool(); - - return GrMatrixConvolutionEffect::Make(std::move(view), + return GrMatrixConvolutionEffect::Make(d->context()->priv().asRecordingContext(), + std::move(view), bounds, kernelSize, kernel.get(), diff --git a/src/gpu/effects/GrMatrixConvolutionEffect.h b/src/gpu/effects/GrMatrixConvolutionEffect.h index bb0a202800..4a5627bdec 100644 --- a/src/gpu/effects/GrMatrixConvolutionEffect.h +++ 
b/src/gpu/effects/GrMatrixConvolutionEffect.h @@ -9,15 +9,18 @@ #define GrMatrixConvolutionEffect_DEFINED #include "src/gpu/GrFragmentProcessor.h" - -// A little bit less than the minimum # uniforms required by DX9SM2 (32). -// Allows for a 5x5 kernel (or 28x1, for that matter). -// Must be a multiple of 4, since we upload these in vec4s. -#define MAX_KERNEL_SIZE 28 +#include +#include class GrMatrixConvolutionEffect : public GrFragmentProcessor { public: - static std::unique_ptr Make(GrSurfaceProxyView srcView, + // A little bit less than the minimum # uniforms required by DX9SM2 (32). + // Allows for a 5x5 kernel (or 28x1, for that matter). + // Must be a multiple of 4, since we upload these in vec4s. + static constexpr int kMaxUniformSize = 28; + + static std::unique_ptr Make(GrRecordingContext*, + GrSurfaceProxyView srcView, const SkIRect& srcBounds, const SkISize& kernelSize, const SkScalar* kernel, @@ -28,7 +31,8 @@ public: bool convolveAlpha, const GrCaps&); - static std::unique_ptr MakeGaussian(GrSurfaceProxyView srcView, + static std::unique_ptr MakeGaussian(GrRecordingContext*, + GrSurfaceProxyView srcView, const SkIRect& srcBounds, const SkISize& kernelSize, SkScalar gain, @@ -41,9 +45,12 @@ public: const GrCaps&); const SkIRect& bounds() const { return fBounds; } - const SkISize& kernelSize() const { return fKernelSize; } + SkISize kernelSize() const { return fKernel.size(); } const SkV2 kernelOffset() const { return fKernelOffset; } - const float* kernel() const { return fKernel; } + bool kernelIsSampled() const { return fKernel.isSampled(); } + const float *kernel() const { return fKernel.array().data(); } + float kernelSampleGain() const { return fKernel.scalableSampler().fGain; } + float kernelSampleBias() const { return fKernel.scalableSampler().fBias; } float gain() const { return fGain; } float bias() const { return fBias; } bool convolveAlpha() const { return fConvolveAlpha; } @@ -53,11 +60,74 @@ public: std::unique_ptr clone() const override; 
private: - // srcProxy is the texture that is going to be convolved - // srcBounds is the subset of 'srcProxy' that will be used (e.g., for clamp mode) - GrMatrixConvolutionEffect(std::unique_ptr, - const SkISize& kernelSize, - const SkScalar* kernel, + /** + * Small kernels are represented as float-arrays and uploaded as uniforms. + * Large kernels go over the uniform limit and are uploaded as textures and sampled. + * If Float16 textures are supported, we use those. Otherwise we use A8. + */ + class KernelWrapper { + public: + struct ScalableSampler { + TextureSampler fSampler; + // Only used in A8 mode. Applied before any other math. + float fBias = 0.0f; + // Only used in A8 mode. Premultiplied in with user gain to save time. + float fGain = 1.0f; + bool operator==(const ScalableSampler&) const; + }; + static KernelWrapper Make(GrRecordingContext*, SkISize, + const GrCaps&, const float* values); + + KernelWrapper(KernelWrapper&& that) : fSize(that.fSize) { + if (that.isSampled()) { + new (&fScalableSampler) ScalableSampler(std::move(that.fScalableSampler)); + } else { + new (&fArray) std::array(std::move(that.fArray)); + } + } + KernelWrapper(const KernelWrapper& that) : fSize(that.fSize) { + if (that.isSampled()) { + new (&fScalableSampler) ScalableSampler(that.fScalableSampler); + } else { + new (&fArray) std::array(that.fArray); + } + } + ~KernelWrapper() { + if (this->isSampled()) { + fScalableSampler.~ScalableSampler(); + } + } + + bool isValid() const { return !fSize.isEmpty(); } + SkISize size() const { return fSize; } + bool isSampled() const { return fSize.area() > kMaxUniformSize; } + const std::array& array() const { + SkASSERT(!this->isSampled()); + return fArray; + } + const ScalableSampler& scalableSampler() const { + SkASSERT(this->isSampled()); + return fScalableSampler; + } + bool operator==(const KernelWrapper&) const; + + private: + KernelWrapper() : fSize({}) {} + KernelWrapper(SkISize size) : fSize(size) { + if (this->isSampled()) { + new 
(&fScalableSampler) ScalableSampler; + } + } + + SkISize fSize; + union { + std::array fArray; + ScalableSampler fScalableSampler; + }; + }; + + GrMatrixConvolutionEffect(std::unique_ptr child, + KernelWrapper kernel, SkScalar gain, SkScalar bias, const SkIPoint& kernelOffset, @@ -71,12 +141,13 @@ private: bool onIsEqual(const GrFragmentProcessor&) const override; + const GrFragmentProcessor::TextureSampler& onTextureSampler(int index) const override; + // We really just want the unaltered local coords, but the only way to get that right now is // an identity coord transform. GrCoordTransform fCoordTransform = {}; SkIRect fBounds; - SkISize fKernelSize; - float fKernel[MAX_KERNEL_SIZE]; + KernelWrapper fKernel; float fGain; float fBias; SkV2 fKernelOffset;