diff --git a/src/opts/SkRasterPipeline_opts.h b/src/opts/SkRasterPipeline_opts.h
index 4ba3f772e8..7948143a4c 100644
--- a/src/opts/SkRasterPipeline_opts.h
+++ b/src/opts/SkRasterPipeline_opts.h
@@ -3076,16 +3076,46 @@ static void start_pipeline(const size_t x0,     const size_t y0,
 
 // ~~~~~~ Commonly used helper functions ~~~~~~ //
 
+/**
+ * Helpers to to properly rounded division (by 255). The ideal answer we want to compute is slow,
+ * thanks to a division by a non-power of two:
+ *   [1]  (v + 127) / 255
+ *
+ * There is a two-step process that computes the correct answer for all inputs:
+ *   [2]  (v + 128 + ((v + 128) >> 8)) >> 8
+ *
+ * There is also a single iteration approximation, but it's wrong (+-1) ~25% of the time:
+ *   [3]  (v + 255) >> 8;
+ *
+ * We offer two different implementations here, depending on the requirements of the calling stage.
+ */
+
+/**
+ * div255 favors speed over accuracy. It uses formula [2] on NEON (where we can compute it as fast
+ * as [3]), and uses [3] elsewhere.
+ */
 SI U16 div255(U16 v) {
-#if 0
-    return (v+127)/255;  // The ideal rounding divide by 255.
-#elif 1 && defined(JUMPER_IS_NEON)
-    // With NEON we can compute (v+127)/255 as (v + ((v+128)>>8) + 128)>>8
-    // just as fast as we can do the approximation below, so might as well be correct!
-    // First we compute v + ((v+128)>>8), then one more round of (...+128)>>8 to finish up.
+#if defined(JUMPER_IS_NEON)
+    // With NEON we can compute [2] just as fast as [3], so let's be correct.
+    // First we compute v + ((v+128)>>8), then one more round of (...+128)>>8 to finish up:
     return vrshrq_n_u16(vrsraq_n_u16(v, v, 8), 8);
 #else
-    return (v+255)/256;  // A good approximation of (v+127)/255.
+    // Otherwise, use [3], which is never wrong by more than 1:
+    return (v+255)/256;
+#endif
+}
+
+/**
+ * div255_accurate guarantees the right answer on all platforms, at the expense of performance.
+ */
+SI U16 div255_accurate(U16 v) {
+#if defined(JUMPER_IS_NEON)
+    // Our NEON implementation of div255 is already correct for all inputs:
+    return div255(v);
+#else
+    // This is [2] (the same formulation as NEON), but written without the benefit of intrinsics:
+    v += 128;
+    return (v+(v/256))/256;
 #endif
 }
 
@@ -3315,14 +3345,14 @@ STAGE_PP(clamp_gamut, Ctx::None) {
 }
 
 STAGE_PP(premul, Ctx::None) {
-    r = div255(r * a);
-    g = div255(g * a);
-    b = div255(b * a);
+    r = div255_accurate(r * a);
+    g = div255_accurate(g * a);
+    b = div255_accurate(b * a);
 }
 STAGE_PP(premul_dst, Ctx::None) {
-    dr = div255(dr * da);
-    dg = div255(dg * da);
-    db = div255(db * da);
+    dr = div255_accurate(dr * da);
+    dg = div255_accurate(dg * da);
+    db = div255_accurate(db * da);
 }
 
 STAGE_PP(force_opaque    , Ctx::None) {  a = 255; }
diff --git a/tests/PremulAlphaRoundTripTest.cpp b/tests/PremulAlphaRoundTripTest.cpp
index 38ec104801..61542f527f 100644
--- a/tests/PremulAlphaRoundTripTest.cpp
+++ b/tests/PremulAlphaRoundTripTest.cpp
@@ -9,6 +9,8 @@
 #include "include/core/SkCanvas.h"
 #include "include/core/SkSurface.h"
 #include "include/gpu/GrDirectContext.h"
+#include "src/core/SkConvertPixels.h"
+#include "src/gpu/GrPixmap.h"
 #include "tests/Test.h"
 #include "tools/ToolUtils.h"
 
@@ -105,3 +107,118 @@ DEF_GPUTEST_FOR_RENDERING_CONTEXTS(PremulAlphaRoundTrip_Gpu, reporter, ctxInfo)
                                                       SkBudgeted::kNo, info));
     test_premul_alpha_roundtrip(reporter, surf.get());
 }
+
+DEF_TEST(PremulAlphaRoundTripGrConvertPixels, reporter) {
+    // Code that does the same thing as above, but using GrConvertPixels. This simulates what
+    // happens if you run the above on a machine with a GPU that doesn't have a valid PM/UPM
+    // conversion pair of FPs.
+    const SkImageInfo upmInfo =
+            SkImageInfo::Make(256, 256, kRGBA_8888_SkColorType, kUnpremul_SkAlphaType);
+    const SkImageInfo pmInfo =
+            SkImageInfo::Make(256, 256, kRGBA_8888_SkColorType, kPremul_SkAlphaType);
+
+    GrPixmap src = GrPixmap::Allocate(upmInfo);
+    uint32_t* srcPixels = (uint32_t*)src.addr();
+    for (int y = 0; y < 256; ++y) {
+        for (int x = 0; x < 256; ++x) {
+            srcPixels[y * 256 + x] = pack_unpremul_rgba(SkColorSetARGB(y, x, x, x));
+        }
+    }
+
+    GrPixmap surf = GrPixmap::Allocate(pmInfo);
+    GrConvertPixels(surf, src);
+
+    GrPixmap read1 = GrPixmap::Allocate(upmInfo);
+    GrConvertPixels(read1, surf);
+
+    GrPixmap surf2 = GrPixmap::Allocate(pmInfo);
+    GrConvertPixels(surf2, read1);
+
+    GrPixmap read2 = GrPixmap::Allocate(upmInfo);
+    GrConvertPixels(read2, surf2);
+
+    auto get_pixel = [](const GrPixmap& pm, int x, int y) {
+        const uint32_t* addr = (const uint32_t*)pm.addr();
+        return addr[y * 256 + x];
+    };
+    auto dump_pixel_history = [&](int x, int y) {
+        SkDebugf("Pixel history for (%d, %d):\n", x, y);
+        SkDebugf("Src : %08x\n", get_pixel(src, x, y));
+        SkDebugf(" -> : %08x\n", get_pixel(surf, x, y));
+        SkDebugf(" <- : %08x\n", get_pixel(read1, x, y));
+        SkDebugf(" -> : %08x\n", get_pixel(surf2, x, y));
+        SkDebugf(" <- : %08x\n", get_pixel(read2, x, y));
+    };
+
+    bool success = true;
+    for (int y = 0; y < 256 && success; ++y) {
+        const uint32_t* pixels1 = (const uint32_t*) read1.addr();
+        const uint32_t* pixels2 = (const uint32_t*) read2.addr();
+        for (int x = 0; x < 256 && success; ++x) {
+            uint32_t c1 = pixels1[y * 256 + x],
+                     c2 = pixels2[y * 256 + x];
+            // If this ever fails, it's helpful to see where it goes wrong.
+            if (c1 != c2) {
+                dump_pixel_history(x, y);
+            }
+            REPORTER_ASSERT(reporter, success = c1 == c2);
+        }
+    }
+}
+
+DEF_TEST(PremulAlphaRoundTripSkConvertPixels, reporter) {
+    // ... and now using SkConvertPixels, just for completeness
+    const SkImageInfo upmInfo =
+            SkImageInfo::Make(256, 256, kRGBA_8888_SkColorType, kUnpremul_SkAlphaType);
+    const SkImageInfo pmInfo =
+            SkImageInfo::Make(256, 256, kRGBA_8888_SkColorType, kPremul_SkAlphaType);
+
+    SkBitmap src; src.allocPixels(upmInfo);
+    uint32_t* srcPixels = src.getAddr32(0, 0);
+    for (int y = 0; y < 256; ++y) {
+        for (int x = 0; x < 256; ++x) {
+            srcPixels[y * 256 + x] = pack_unpremul_rgba(SkColorSetARGB(y, x, x, x));
+        }
+    }
+
+    auto convert = [](const SkBitmap& dst, const SkBitmap& src){
+        SkAssertResult(SkConvertPixels(dst.info(), dst.getAddr(0, 0), dst.rowBytes(),
+                                       src.info(), src.getAddr(0, 0), src.rowBytes()));
+    };
+
+    SkBitmap surf; surf.allocPixels(pmInfo);
+    convert(surf, src);
+
+    SkBitmap read1; read1.allocPixels(upmInfo);
+    convert(read1, surf);
+
+    SkBitmap surf2; surf2.allocPixels(pmInfo);
+    convert(surf2, read1);
+
+    SkBitmap read2; read2.allocPixels(upmInfo);
+    convert(read2, surf2);
+
+    auto dump_pixel_history = [&](int x, int y) {
+        SkDebugf("Pixel history for (%d, %d):\n", x, y);
+        SkDebugf("Src : %08x\n", *src.getAddr32(x, y));
+        SkDebugf(" -> : %08x\n", *surf.getAddr32(x, y));
+        SkDebugf(" <- : %08x\n", *read1.getAddr32(x, y));
+        SkDebugf(" -> : %08x\n", *surf2.getAddr32(x, y));
+        SkDebugf(" <- : %08x\n", *read2.getAddr32(x, y));
+    };
+
+    bool success = true;
+    for (int y = 0; y < 256 && success; ++y) {
+        const uint32_t* pixels1 = read1.getAddr32(0, 0);
+        const uint32_t* pixels2 = read2.getAddr32(0, 0);
+        for (int x = 0; x < 256 && success; ++x) {
+            uint32_t c1 = pixels1[y * 256 + x],
+                     c2 = pixels2[y * 256 + x];
+            // If this ever fails, it's helpful to see where it goes wrong.
+            if (c1 != c2) {
+                dump_pixel_history(x, y);
+            }
+            REPORTER_ASSERT(reporter, success = c1 == c2);
+        }
+    }
+}