cppgc: Use load/store instead of fetch_or

fetch_or (a lock-prefixed or on x86) is emulated with branching on armv7/armv8 and therefore generates more instructions. Replacing it with a separate load and store reduces the Android binary size by 45K and should also improve performance. Bug: chromium:1238884 Change-Id: I48f59b645a4bb872b3798a1fde11608fd2930ce6 Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/3090342 Commit-Queue: Anton Bikineev <bikineev@chromium.org> Auto-Submit: Anton Bikineev <bikineev@chromium.org> Reviewed-by: Michael Lippautz <mlippautz@chromium.org> Cr-Commit-Position: refs/heads/master@{#76260}
2021-08-12 14:37:25 +02:00 · 2021-08-12 14:37:25 +02:00 · 2e006255ca
commit 2e006255ca
parent bdcda72cd1
1 changed files with 7 additions and 2 deletions
--- a/include/cppgc/allocation.h
+++ b/include/cppgc/allocation.h
@@ -36,8 +36,13 @@ class V8_EXPORT MakeGarbageCollectedTraitInternal {
            const_cast<uint16_t*>(reinterpret_cast<const uint16_t*>(
                reinterpret_cast<const uint8_t*>(payload) -
                api_constants::kFullyConstructedBitFieldOffsetFromPayload)));
-    atomic_mutable_bitfield->fetch_or(api_constants::kFullyConstructedBitMask,
-                                      std::memory_order_release);
+    // It's safe to use a separate load and store here (instead of a single
+    // read-modify-write operation), since it's guaranteed that this 16-bit
+    // bitfield is only modified by a single thread. This is cheaper in terms
+    // of code size (on ARM) and performance.
+    uint16_t value = atomic_mutable_bitfield->load(std::memory_order_relaxed);
+    value |= api_constants::kFullyConstructedBitMask;
+    atomic_mutable_bitfield->store(value, std::memory_order_release);
  }

  template <typename U, typename CustomSpace>