SkJumper: be more precise by rejecting data sections.

This allows %rip-relative addressing as long as it does not point into a data section. This lets us use switch tables, avoiding loops and the stack.

On HSW:
  SkRasterPipeline_f16: 90 -> 63
  SkRasterPipeline_srgb: 170 -> 97

Change-Id: I3ca2e4ff819b70beea78be75579f9d80c06979e8
Reviewed-on: https://skia-review.googlesource.com/9146
Reviewed-by: Herb Derby <herb@google.com>
Commit-Queue: Mike Klein <mtklein@chromium.org>
2017-03-02 11:16:22 -05:00 · 2017-03-02 11:16:22 -05:00 · 4e7fc0c5da
commit 4e7fc0c5da
parent b56dedf70b
4 changed files with 1495 additions and 634 deletions
--- a/src/jumper/SkJumper_generated.S
+++ b/src/jumper/SkJumper_generated.S
--- a/src/jumper/SkJumper_generated_win.S
+++ b/src/jumper/SkJumper_generated_win.S
--- a/src/jumper/SkJumper_stages.cpp
+++ b/src/jumper/SkJumper_stages.cpp
@ -240,11 +240,17 @@ static const size_t kStride = sizeof(F) / sizeof(float);
 template <typename V, typename T>
 static inline V load(const T* src, size_t tail) {
 #if defined(JUMPER)
+    __builtin_assume(tail < kStride);
    if (__builtin_expect(tail, 0)) {
        V v{};  // Any inactive lanes are zeroed.
-        #pragma nounroll
-        for (size_t i = 0; i < tail; i++) {
-            v[i] = src[i];
+        switch (tail-1) {
+            case 6: v[6] = src[6];
+            case 5: v[5] = src[5];
+            case 4: v[4] = src[4];
+            case 3: v[3] = src[3];
+            case 2: v[2] = src[2];
+            case 1: v[1] = src[1];
+            case 0: v[0] = src[0];
        }
        return v;
    }
@ -272,10 +278,16 @@ static inline V load(const T* src, size_t tail) {
 template <typename V, typename T>
 static inline void store(T* dst, V v, size_t tail) {
 #if defined(JUMPER)
+    __builtin_assume(tail < kStride);
    if (__builtin_expect(tail, 0)) {
-        #pragma nounroll
-        for (size_t i = 0; i < tail; i++) {
-            dst[i] = v[i];
+        switch (tail-1) {
+            case 6: dst[6] = v[6];
+            case 5: dst[5] = v[5];
+            case 4: dst[4] = v[4];
+            case 3: dst[3] = v[3];
+            case 2: dst[2] = v[2];
+            case 1: dst[1] = v[1];
+            case 0: dst[0] = v[0];
        }
        return;
    }
--- a/src/jumper/build_stages.py
+++ b/src/jumper/build_stages.py
@ -80,11 +80,20 @@ def parse_object_file(dot_o, directive, target=None):
  if directive != '.long':
    dehex = lambda h: str(int(h, 16))

-  cmd = [ objdump, '-d', '--insn-width=9', dot_o]
+  cmd = [objdump]
  if target:
    cmd += ['--target', target]

-  for line in subprocess.check_output(cmd).split('\n'):
+  # Look for sections we know we can't handle.
+  section_headers = subprocess.check_output(cmd + ['-h', dot_o])
+  for section in ['.literal4', '.literal8', '.literal16', '.const']:
+    if section in section_headers:
+      print >>sys.stderr, 'Found %s section, which we cannot handle.' % section
+      assert section not in section_headers
+
+  # Ok.  Let's disassemble.
+  disassemble = ['-d', '--insn-width=9', dot_o]
+  for line in subprocess.check_output(cmd + disassemble).split('\n'):
    line = line.strip()

    if not line or line.startswith(dot_o) or line.startswith('Disassembly'):
@ -98,12 +107,6 @@ def parse_object_file(dot_o, directive, target=None):
      print '_' + m.group(1) + label
      continue

-    # ip-relative addressing usually means we're loading a constant,
-    # which we don't support.
-    if '%rip' in line:
-      print >>sys.stderr, line
-      assert '%rip' not in line
-
    columns = line.split('\t')
    code = columns[1]
    if len(columns) >= 4: