From 750c2d07f7f317903ae11ab57c4b45b7b613f682 Mon Sep 17 00:00:00 2001
From: John Kessenich <cepheus@frii.com>
Date: Fri, 26 May 2017 00:01:36 -0600
Subject: [PATCH] SPV: When passing structs of opaque types, flatten and pass
 the members instead.

This avoids either A) needing a uniformConstant struct, or
B) initializing a struct with opaque members, as writing them is not
allowed.
---
 Test/baseResults/hlsl.flattenOpaque.frag.out | 297 +++++++++++++++++++
 Test/hlsl.flattenOpaque.frag                 |  40 +++
 gtests/Hlsl.FromFile.cpp                     |   1 +
 hlsl/hlslParseHelper.cpp                     | 252 +++++++++++-----
 hlsl/hlslParseHelper.h                       |   6 +-
 hlsl/hlslParseables.cpp                      |   2 +-
 6 files changed, 528 insertions(+), 70 deletions(-)
 create mode 100755 Test/baseResults/hlsl.flattenOpaque.frag.out
 create mode 100644 Test/hlsl.flattenOpaque.frag

diff --git a/Test/baseResults/hlsl.flattenOpaque.frag.out b/Test/baseResults/hlsl.flattenOpaque.frag.out
new file mode 100755
index 000000000..392ff722a
--- /dev/null
+++ b/Test/baseResults/hlsl.flattenOpaque.frag.out
@@ -0,0 +1,297 @@
+hlsl.flattenOpaque.frag
+Shader version: 500
+gl_FragCoord origin is upper left
+0:? Sequence
+0:15  Function Definition: osCall1(struct-os-p11; ( temp 4-component vector of float)
+0:15    Function Parameters: 
+0:?       's2D' ( in sampler)
+0:?     Sequence
+0:16      Branch: Return with expression
+0:16        texture ( temp 4-component vector of float)
+0:16          Construct combined texture-sampler ( temp sampler2D)
+0:16            'tex' ( uniform texture2D)
+0:?             's2D' ( in sampler)
+0:?           Constant:
+0:?             0.200000
+0:?             0.300000
+0:20  Function Definition: osCall2(struct-os-p11;vf2; ( temp 4-component vector of float)
+0:20    Function Parameters: 
+0:?       's2D' ( in sampler)
+0:20      'f2' ( in 2-component vector of float)
+0:?     Sequence
+0:21      Branch: Return with expression
+0:21        texture ( temp 4-component vector of float)
+0:21          Construct combined texture-sampler ( temp sampler2D)
+0:21            'tex' ( uniform texture2D)
+0:?             's2D' ( in sampler)
+0:21          'f2' ( in 2-component vector of float)
+0:25  Function Definition: os2Call1(struct-os2-p1-t211; ( temp 4-component vector of float)
+0:25    Function Parameters: 
+0:?       's2D' ( in sampler)
+0:?       'tex' ( in texture2D)
+0:?     Sequence
+0:26      Branch: Return with expression
+0:26        texture ( temp 4-component vector of float)
+0:26          Construct combined texture-sampler ( temp sampler2D)
+0:?             'tex' ( in texture2D)
+0:?             's2D' ( in sampler)
+0:?           Constant:
+0:?             0.200000
+0:?             0.300000
+0:30  Function Definition: os2Call2(struct-os2-p1-t211;vf2; ( temp 4-component vector of float)
+0:30    Function Parameters: 
+0:?       's2D' ( in sampler)
+0:?       'tex' ( in texture2D)
+0:30      'f2' ( in 2-component vector of float)
+0:?     Sequence
+0:31      Branch: Return with expression
+0:31        texture ( temp 4-component vector of float)
+0:31          Construct combined texture-sampler ( temp sampler2D)
+0:?             'tex' ( in texture2D)
+0:?             's2D' ( in sampler)
+0:31          'f2' ( in 2-component vector of float)
+0:35  Function Definition: @main( ( temp 4-component vector of float)
+0:35    Function Parameters: 
+0:?     Sequence
+0:39      Branch: Return with expression
+0:38        add ( temp 4-component vector of float)
+0:37          add ( temp 4-component vector of float)
+0:36            add ( temp 4-component vector of float)
+0:36              Function Call: osCall1(struct-os-p11; ( temp 4-component vector of float)
+0:?                 's2D' ( uniform sampler)
+0:37              Function Call: osCall2(struct-os-p11;vf2; ( temp 4-component vector of float)
+0:?                 's2D' ( uniform sampler)
+0:?                 Constant:
+0:?                   0.200000
+0:?                   0.300000
+0:38            Function Call: os2Call1(struct-os2-p1-t211; ( temp 4-component vector of float)
+0:?               's2D' ( uniform sampler)
+0:?               'tex' ( uniform texture2D)
+0:39          Function Call: os2Call2(struct-os2-p1-t211;vf2; ( temp 4-component vector of float)
+0:?             's2D' ( uniform sampler)
+0:?             'tex' ( uniform texture2D)
+0:?             Constant:
+0:?               0.200000
+0:?               0.300000
+0:35  Function Definition: main( ( temp void)
+0:35    Function Parameters: 
+0:?     Sequence
+0:35      move second child to first child ( temp 4-component vector of float)
+0:?         '@entryPointOutput' (layout( location=0) out 4-component vector of float)
+0:35        Function Call: @main( ( temp 4-component vector of float)
+0:?   Linker Objects
+0:?     'tex' ( uniform texture2D)
+0:?     '@entryPointOutput' (layout( location=0) out 4-component vector of float)
+
+
+Linked fragment stage:
+
+
+Shader version: 500
+gl_FragCoord origin is upper left
+0:? Sequence
+0:15  Function Definition: osCall1(struct-os-p11; ( temp 4-component vector of float)
+0:15    Function Parameters: 
+0:?       's2D' ( in sampler)
+0:?     Sequence
+0:16      Branch: Return with expression
+0:16        texture ( temp 4-component vector of float)
+0:16          Construct combined texture-sampler ( temp sampler2D)
+0:16            'tex' ( uniform texture2D)
+0:?             's2D' ( in sampler)
+0:?           Constant:
+0:?             0.200000
+0:?             0.300000
+0:20  Function Definition: osCall2(struct-os-p11;vf2; ( temp 4-component vector of float)
+0:20    Function Parameters: 
+0:?       's2D' ( in sampler)
+0:20      'f2' ( in 2-component vector of float)
+0:?     Sequence
+0:21      Branch: Return with expression
+0:21        texture ( temp 4-component vector of float)
+0:21          Construct combined texture-sampler ( temp sampler2D)
+0:21            'tex' ( uniform texture2D)
+0:?             's2D' ( in sampler)
+0:21          'f2' ( in 2-component vector of float)
+0:25  Function Definition: os2Call1(struct-os2-p1-t211; ( temp 4-component vector of float)
+0:25    Function Parameters: 
+0:?       's2D' ( in sampler)
+0:?       'tex' ( in texture2D)
+0:?     Sequence
+0:26      Branch: Return with expression
+0:26        texture ( temp 4-component vector of float)
+0:26          Construct combined texture-sampler ( temp sampler2D)
+0:?             'tex' ( in texture2D)
+0:?             's2D' ( in sampler)
+0:?           Constant:
+0:?             0.200000
+0:?             0.300000
+0:30  Function Definition: os2Call2(struct-os2-p1-t211;vf2; ( temp 4-component vector of float)
+0:30    Function Parameters: 
+0:?       's2D' ( in sampler)
+0:?       'tex' ( in texture2D)
+0:30      'f2' ( in 2-component vector of float)
+0:?     Sequence
+0:31      Branch: Return with expression
+0:31        texture ( temp 4-component vector of float)
+0:31          Construct combined texture-sampler ( temp sampler2D)
+0:?             'tex' ( in texture2D)
+0:?             's2D' ( in sampler)
+0:31          'f2' ( in 2-component vector of float)
+0:35  Function Definition: @main( ( temp 4-component vector of float)
+0:35    Function Parameters: 
+0:?     Sequence
+0:39      Branch: Return with expression
+0:38        add ( temp 4-component vector of float)
+0:37          add ( temp 4-component vector of float)
+0:36            add ( temp 4-component vector of float)
+0:36              Function Call: osCall1(struct-os-p11; ( temp 4-component vector of float)
+0:?                 's2D' ( uniform sampler)
+0:37              Function Call: osCall2(struct-os-p11;vf2; ( temp 4-component vector of float)
+0:?                 's2D' ( uniform sampler)
+0:?                 Constant:
+0:?                   0.200000
+0:?                   0.300000
+0:38            Function Call: os2Call1(struct-os2-p1-t211; ( temp 4-component vector of float)
+0:?               's2D' ( uniform sampler)
+0:?               'tex' ( uniform texture2D)
+0:39          Function Call: os2Call2(struct-os2-p1-t211;vf2; ( temp 4-component vector of float)
+0:?             's2D' ( uniform sampler)
+0:?             'tex' ( uniform texture2D)
+0:?             Constant:
+0:?               0.200000
+0:?               0.300000
+0:35  Function Definition: main( ( temp void)
+0:35    Function Parameters: 
+0:?     Sequence
+0:35      move second child to first child ( temp 4-component vector of float)
+0:?         '@entryPointOutput' (layout( location=0) out 4-component vector of float)
+0:35        Function Call: @main( ( temp 4-component vector of float)
+0:?   Linker Objects
+0:?     'tex' ( uniform texture2D)
+0:?     '@entryPointOutput' (layout( location=0) out 4-component vector of float)
+
+// Module Version 10000
+// Generated by (magic number): 80001
+// Id's are bound by 85
+
+                              Capability Shader
+               1:             ExtInstImport  "GLSL.std.450"
+                              MemoryModel Logical GLSL450
+                              EntryPoint Fragment 4  "main" 83
+                              ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
+                              Name 4  "main"
+                              Name 12  "osCall1(struct-os-p11;"
+                              Name 11  "s2D"
+                              Name 19  "osCall2(struct-os-p11;vf2;"
+                              Name 17  "s2D"
+                              Name 18  "f2"
+                              Name 26  "os2Call1(struct-os2-p1-t211;"
+                              Name 24  "s2D"
+                              Name 25  "tex"
+                              Name 32  "os2Call2(struct-os2-p1-t211;vf2;"
+                              Name 29  "s2D"
+                              Name 30  "tex"
+                              Name 31  "f2"
+                              Name 35  "@main("
+                              Name 37  "tex"
+                              Name 68  "s2D"
+                              Name 70  "param"
+                              Name 73  "s2D"
+                              Name 74  "tex"
+                              Name 77  "param"
+                              Name 83  "@entryPointOutput"
+                              Decorate 37(tex) DescriptorSet 0
+                              Decorate 68(s2D) DescriptorSet 0
+                              Decorate 73(s2D) DescriptorSet 0
+                              Decorate 74(tex) DescriptorSet 0
+                              Decorate 83(@entryPointOutput) Location 0
+               2:             TypeVoid
+               3:             TypeFunction 2
+               6:             TypeSampler
+               7:             TypePointer UniformConstant 6
+               8:             TypeFloat 32
+               9:             TypeVector 8(float) 4
+              10:             TypeFunction 9(fvec4) 7(ptr)
+              14:             TypeVector 8(float) 2
+              15:             TypePointer Function 14(fvec2)
+              16:             TypeFunction 9(fvec4) 7(ptr) 15(ptr)
+              21:             TypeImage 8(float) 2D sampled format:Unknown
+              22:             TypePointer UniformConstant 21
+              23:             TypeFunction 9(fvec4) 7(ptr) 22(ptr)
+              28:             TypeFunction 9(fvec4) 7(ptr) 22(ptr) 15(ptr)
+              34:             TypeFunction 9(fvec4)
+         37(tex):     22(ptr) Variable UniformConstant
+              40:             TypeSampledImage 21
+              42:    8(float) Constant 1045220557
+              43:    8(float) Constant 1050253722
+              44:   14(fvec2) ConstantComposite 42 43
+         68(s2D):      7(ptr) Variable UniformConstant
+         73(s2D):      7(ptr) Variable UniformConstant
+         74(tex):     22(ptr) Variable UniformConstant
+              82:             TypePointer Output 9(fvec4)
+83(@entryPointOutput):     82(ptr) Variable Output
+         4(main):           2 Function None 3
+               5:             Label
+              84:    9(fvec4) FunctionCall 35(@main()
+                              Store 83(@entryPointOutput) 84
+                              Return
+                              FunctionEnd
+12(osCall1(struct-os-p11;):    9(fvec4) Function None 10
+         11(s2D):      7(ptr) FunctionParameter
+              13:             Label
+              38:          21 Load 37(tex)
+              39:           6 Load 11(s2D)
+              41:          40 SampledImage 38 39
+              45:    9(fvec4) ImageSampleImplicitLod 41 44
+                              ReturnValue 45
+                              FunctionEnd
+19(osCall2(struct-os-p11;vf2;):    9(fvec4) Function None 16
+         17(s2D):      7(ptr) FunctionParameter
+          18(f2):     15(ptr) FunctionParameter
+              20:             Label
+              48:          21 Load 37(tex)
+              49:           6 Load 17(s2D)
+              50:          40 SampledImage 48 49
+              51:   14(fvec2) Load 18(f2)
+              52:    9(fvec4) ImageSampleImplicitLod 50 51
+                              ReturnValue 52
+                              FunctionEnd
+26(os2Call1(struct-os2-p1-t211;):    9(fvec4) Function None 23
+         24(s2D):      7(ptr) FunctionParameter
+         25(tex):     22(ptr) FunctionParameter
+              27:             Label
+              55:          21 Load 25(tex)
+              56:           6 Load 24(s2D)
+              57:          40 SampledImage 55 56
+              58:    9(fvec4) ImageSampleImplicitLod 57 44
+                              ReturnValue 58
+                              FunctionEnd
+32(os2Call2(struct-os2-p1-t211;vf2;):    9(fvec4) Function None 28
+         29(s2D):      7(ptr) FunctionParameter
+         30(tex):     22(ptr) FunctionParameter
+          31(f2):     15(ptr) FunctionParameter
+              33:             Label
+              61:          21 Load 30(tex)
+              62:           6 Load 29(s2D)
+              63:          40 SampledImage 61 62
+              64:   14(fvec2) Load 31(f2)
+              65:    9(fvec4) ImageSampleImplicitLod 63 64
+                              ReturnValue 65
+                              FunctionEnd
+      35(@main():    9(fvec4) Function None 34
+              36:             Label
+       70(param):     15(ptr) Variable Function
+       77(param):     15(ptr) Variable Function
+              69:    9(fvec4) FunctionCall 12(osCall1(struct-os-p11;) 68(s2D)
+                              Store 70(param) 44
+              71:    9(fvec4) FunctionCall 19(osCall2(struct-os-p11;vf2;) 68(s2D) 70(param)
+              72:    9(fvec4) FAdd 69 71
+              75:    9(fvec4) FunctionCall 26(os2Call1(struct-os2-p1-t211;) 73(s2D) 74(tex)
+              76:    9(fvec4) FAdd 72 75
+                              Store 77(param) 44
+              78:    9(fvec4) FunctionCall 32(os2Call2(struct-os2-p1-t211;vf2;) 73(s2D) 74(tex) 77(param)
+              79:    9(fvec4) FAdd 76 78
+                              ReturnValue 79
+                              FunctionEnd
diff --git a/Test/hlsl.flattenOpaque.frag b/Test/hlsl.flattenOpaque.frag
new file mode 100644
index 000000000..279be8a0b
--- /dev/null
+++ b/Test/hlsl.flattenOpaque.frag
@@ -0,0 +1,40 @@
+struct os {
+    sampler2D s2D;
+};
+
+struct os2 {
+    sampler2D s2D;
+    Texture2D tex;
+};
+
+Texture2D tex;
+os s;
+os2 s2;
+
+float4 osCall1(os s)
+{
+    return tex.Sample(s.s2D, float2(0.2, 0.3));
+}
+
+float4 osCall2(os s, float2 f2)
+{
+    return tex.Sample(s.s2D, f2);
+}
+
+float4 os2Call1(os2 s)
+{
+    return s.tex.Sample(s.s2D, float2(0.2, 0.3));
+}
+
+float4 os2Call2(os2 s, float2 f2)
+{
+    return s.tex.Sample(s.s2D, f2);
+}
+
+float4 main() : SV_TARGET0
+{
+    return osCall1(s) +
+           osCall2(s, float2(0.2, 0.3)) +
+           os2Call1(s2) +
+           os2Call2(s2, float2(0.2, 0.3));
+}
diff --git a/gtests/Hlsl.FromFile.cpp b/gtests/Hlsl.FromFile.cpp
index 87c830251..91d7ae61a 100644
--- a/gtests/Hlsl.FromFile.cpp
+++ b/gtests/Hlsl.FromFile.cpp
@@ -114,6 +114,7 @@ INSTANTIATE_TEST_CASE_P(
         {"hlsl.float1.frag", "PixelShaderFunction"},
         {"hlsl.float4.frag", "PixelShaderFunction"},
         {"hlsl.flatten.return.frag", "main"},
+        {"hlsl.flattenOpaque.frag", "main"},
         {"hlsl.forLoop.frag", "PixelShaderFunction"},
         {"hlsl.gather.array.dx10.frag", "main"},
         {"hlsl.gather.basic.dx10.frag", "main"},
diff --git a/hlsl/hlslParseHelper.cpp b/hlsl/hlslParseHelper.cpp
index a105b84dc..43949b51b 100755
--- a/hlsl/hlslParseHelper.cpp
+++ b/hlsl/hlslParseHelper.cpp
@@ -774,7 +774,7 @@ TIntermTyped* HlslParseContext::handleBracketDereference(const TSourceLoc& loc,
     else {
         // at least one of base and index is variable...
 
-        if (base->getAsSymbolNode() && (wasFlattened(base) || shouldFlattenUniform(base->getType()))) {
+        if (base->getAsSymbolNode() && (wasFlattened(base) || shouldFlatten(base->getType()))) {
             if (index->getQualifier().storage != EvqConst)
                 error(loc, "Invalid variable index to flattened array", base->getAsSymbolNode()->getName().c_str(), "");
 
@@ -981,7 +981,7 @@ TIntermTyped* HlslParseContext::handleDotDereference(const TSourceLoc& loc, TInt
             }
         }
         if (fieldFound) {
-            if (base->getAsSymbolNode() && (wasFlattened(base) || shouldFlattenUniform(base->getType()))) {
+            if (base->getAsSymbolNode() && (wasFlattened(base) || shouldFlatten(base->getType()))) {
                 result = flattenAccess(base, member);
             } else {
                 // Update the base and member to access if this was a split structure.
@@ -1115,14 +1115,13 @@ TType& HlslParseContext::split(TType& type, TString name, const TType* outerStru
     return type;
 }
 
-// Is this a uniform array which should be flattened?
-bool HlslParseContext::shouldFlattenUniform(const TType& type) const
+// Should this be flattened? True for uniform arrays (when enabled) and for any struct containing opaque types.
+bool HlslParseContext::shouldFlatten(const TType& type) const
 {
     const TStorageQualifier qualifier = type.getQualifier().storage;
 
-    return qualifier == EvqUniform &&
-        ((type.isArray() && intermediate.getFlattenUniformArrays()) || type.isStruct()) &&
-        type.containsOpaque();
+    return (qualifier == EvqUniform && type.isArray() && intermediate.getFlattenUniformArrays()) ||
+           (type.isStruct() && type.containsOpaque());
 }
 
 // Top level variable flattening: construct data
@@ -1285,16 +1284,22 @@ bool HlslParseContext::wasSplit(const TIntermTyped* node) const
 // Turn an access into an aggregate that was flattened to instead be
 // an access to the individual variable the member was flattened to.
 // Assumes shouldFlatten() or equivalent was called first.
+// Also assumes that initFlattening() and finalizeFlattening() bracket usage.
 TIntermTyped* HlslParseContext::flattenAccess(TIntermTyped* base, int member)
 {
     const TType dereferencedType(base->getType(), member);  // dereferenced type
-
     const TIntermSymbol& symbolNode = *base->getAsSymbolNode();
 
-    const auto flattenData = flattenMap.find(symbolNode.getId());
+    TIntermTyped* flattened = flattenAccess(symbolNode.getId(), member, dereferencedType);
+
+    return flattened ? flattened : base;
+}
+TIntermTyped* HlslParseContext::flattenAccess(int uniqueId, int member, const TType& dereferencedType)
+{
+    const auto flattenData = flattenMap.find(uniqueId);
 
     if (flattenData == flattenMap.end())
-        return base;
+        return nullptr;
 
     // Calculate new cumulative offset from the packed tree
     flattenOffset.back() = flattenData->second.offsets[flattenOffset.back() + member];
@@ -1307,7 +1312,7 @@ TIntermTyped* HlslParseContext::flattenAccess(TIntermTyped* base, int member)
     } else {
         // If this is not the final flattening, accumulate the position and return
         // an object of the partially dereferenced type.
-        return new TIntermSymbol(symbolNode.getId(), "flattenShadow", dereferencedType);
+        return new TIntermSymbol(uniqueId, "flattenShadow", dereferencedType);
     }
 }
 
@@ -1663,15 +1668,32 @@ TIntermAggregate* HlslParseContext::handleFunctionDefinition(const TSourceLoc& l
                 symbolTable.makeInternalVariable(*variable);
                 pushImplicitThis(variable);
             }
+
             // Insert the parameters with name in the symbol table.
             if (! symbolTable.insert(*variable))
                 error(loc, "redefinition", variable->getName().c_str(), "");
-            // Add the parameter to the AST
-            paramNodes = intermediate.growAggregate(paramNodes,
-                                                    intermediate.addSymbol(*variable, loc),
-                                                    loc);
 
-            // Add hidden parameter for struct buffer counters, if needed.
+            // Add parameters to the AST list.
+            if (shouldFlatten(variable->getType())) {
+                // Expand the AST parameter nodes (but not the name mangling or symbol table view)
+                // for structures that need to be flattened.
+                flatten(loc, *variable);
+                const TTypeList* structure = variable->getType().getStruct();
+                for (int mem = 0; mem < (int)structure->size(); ++mem) {
+                    initFlattening();
+                    paramNodes = intermediate.growAggregate(paramNodes,
+                                                            flattenAccess(variable->getUniqueId(), mem, *(*structure)[mem].type),
+                                                            loc);
+                    finalizeFlattening();
+                }
+            } else {
+                // Add the parameter to the AST
+                paramNodes = intermediate.growAggregate(paramNodes,
+                                                        intermediate.addSymbol(*variable, loc),
+                                                        loc);
+            }
+
+            // Add hidden AST parameter for struct buffer counters, if needed.
             addStructBufferHiddenCounterParam(loc, param, paramNodes);
         } else
             paramNodes = intermediate.growAggregate(paramNodes, intermediate.addSymbol(*param.type, loc), loc);
@@ -2265,7 +2287,7 @@ TIntermTyped* HlslParseContext::handleAssign(const TSourceLoc& loc, TOperator op
         const bool flattened      = isLeft ? isFlattenLeft : isFlattenRight;
         const bool split          = isLeft ? isSplitLeft : isSplitRight;
         const TIntermTyped* outer = isLeft ? outerLeft   : outerRight;
-        const TVector<TVariable*>& flatVariables      = isLeft ? *leftVariables : *rightVariables;
+        const TVector<TVariable*>& flatVariables = isLeft ? *leftVariables : *rightVariables;
 
         // Index operator if it's an aggregate, else EOpNull
         const TOperator op = node->getType().isArray()  ? EOpIndexDirect : 
@@ -2320,7 +2342,7 @@ TIntermTyped* HlslParseContext::handleAssign(const TSourceLoc& loc, TOperator op
             const int elementsToCopy = std::min(elementsL, elementsR);
 
             // array case
-            for (int element=0; element < elementsToCopy; ++element) {
+            for (int element = 0; element < elementsToCopy; ++element) {
                 arrayElement.push_back(element);
 
                 // Add a new AST symbol node if we have a temp variable holding a complex RHS.
@@ -2511,7 +2533,7 @@ bool HlslParseContext::hasStructBuffCounter(const TType& type) const
     case EbvRWStructuredBuffer:  // ...
         return true;
     default:
-        return false; // the other structuredbfufer types do not have a counter.
+        return false; // the other structuredbuffer types do not have a counter.
     }
 }
 
@@ -4419,14 +4441,18 @@ TIntermTyped* HlslParseContext::handleFunctionCall(const TSourceLoc& loc, TFunct
                 pushFrontArguments(intermediate.addSymbol(*getImplicitThis(thisDepth)), arguments);
             }
 
-            // Convert 'in' arguments
+            // Convert 'in' arguments, so that types match.
+            // However, skip those that need expansion; that is covered next.
             if (arguments)
                 addInputArgumentConversions(*fnCandidate, arguments);
 
-            // If any argument is a pass-by-reference struct buffer with an associated counter
-            // buffer, we have to add another hidden parameter for that counter.
-            if (aggregate && !builtIn)
-                addStructBuffArguments(loc, aggregate);
+            // Expand arguments.  Some arguments must physically expand to a different set
+            // than what the shader declared and passes.
+            if (arguments && !builtIn)
+                expandArguments(loc, *fnCandidate, arguments);
+
+            // Expansion may have changed the form of arguments
+            aggregate = arguments ? arguments->getAsAggregate() : nullptr;
 
             op = fnCandidate->getBuiltInOp();
             if (builtIn && op != EOpNull) {
@@ -4464,24 +4490,35 @@ TIntermTyped* HlslParseContext::handleFunctionCall(const TSourceLoc& loc, TFunct
             decomposeSampleMethods(loc, result, arguments);       // HLSL->AST sample method decompositions
             decomposeGeometryMethods(loc, result, arguments);     // HLSL->AST geometry method decompositions
 
+            // Create the qualifier list, carried in the AST for the call.
+            // Because some arguments expand to multiple arguments, the qualifier list will
+            // be longer than the formal parameter list.
+            if (result == fnNode && result->getAsAggregate()) {
+                TQualifierList& qualifierList = result->getAsAggregate()->getQualifierList();
+                for (int i = 0; i < fnCandidate->getParamCount(); ++i) {
+                    TStorageQualifier qual = (*fnCandidate)[i].type->getQualifier().storage;
+                    if (hasStructBuffCounter(*(*fnCandidate)[i].type)) {
+                        // add buffer and counter buffer argument qualifier
+                        qualifierList.push_back(qual);
+                        qualifierList.push_back(qual);
+                    } else if (shouldFlatten(*(*fnCandidate)[i].type)) {
+                        // add structure member expansion
+                        for (int memb = 0; memb < (int)(*fnCandidate)[i].type->getStruct()->size(); ++memb)
+                            qualifierList.push_back(qual);
+                    } else {
+                        // Normal 1:1 case
+                        qualifierList.push_back(qual);
+                    }
+                }
+            }
+
             // Convert 'out' arguments.  If it was a constant folded built-in, it won't be an aggregate anymore.
             // Built-ins with a single argument aren't called with an aggregate, but they also don't have an output.
             // Also, build the qualifier list for user function calls, which are always called with an aggregate.
             // We don't do this is if there has been a decomposition, which will have added its own conversions
             // for output parameters.
-            if (result == fnNode && result->getAsAggregate()) {
-                TQualifierList& qualifierList = result->getAsAggregate()->getQualifierList();
-                for (int i = 0; i < fnCandidate->getParamCount(); ++i) {
-                    TStorageQualifier qual = (*fnCandidate)[i].type->getQualifier().storage;
-                    qualifierList.push_back(qual);
-
-                    // add counter buffer argument qualifier
-                    if (hasStructBuffCounter(*(*fnCandidate)[i].type))
-                        qualifierList.push_back(qual);
-                }
-
+            if (result == fnNode && result->getAsAggregate())
                 result = addOutputArgumentConversions(*fnCandidate, *result->getAsOperator());
-            }
         }
     }
 
@@ -4512,20 +4549,22 @@ void HlslParseContext::pushFrontArguments(TIntermTyped* front, TIntermTyped*& ar
 void HlslParseContext::addInputArgumentConversions(const TFunction& function, TIntermTyped*& arguments)
 {
     TIntermAggregate* aggregate = arguments->getAsAggregate();
-    const auto setArg = [&](int argNum, TIntermTyped* arg) {
+
+    // Replace a single argument with a single argument.
+    const auto setArg = [&](int paramNum, TIntermTyped* arg) {
         if (function.getParamCount() == 1)
             arguments = arg;
         else {
-            if (aggregate)
-                aggregate->getSequence()[argNum] = arg;
-            else
+            if (aggregate == nullptr)
                 arguments = arg;
+            else
+                aggregate->getSequence()[paramNum] = arg;
         }
     };
 
     // Process each argument's conversion
-    for (int i = 0; i < function.getParamCount(); ++i) {
-        if (! function[i].type->getQualifier().isParamInput())
+    for (int param = 0; param < function.getParamCount(); ++param) {
+        if (! function[param].type->getQualifier().isParamInput())
             continue;
 
         // At this early point there is a slight ambiguity between whether an aggregate 'arguments'
@@ -4533,42 +4572,121 @@ void HlslParseContext::addInputArgumentConversions(const TFunction& function, TI
         // means take 'arguments' itself as the one argument.
         TIntermTyped* arg = function.getParamCount() == 1
                                    ? arguments->getAsTyped()
-                                   : (aggregate ? aggregate->getSequence()[i]->getAsTyped() : arguments->getAsTyped());
-        if (*function[i].type != arg->getType()) {
+                                   : (aggregate ? 
+                                        aggregate->getSequence()[param]->getAsTyped() :
+                                        arguments->getAsTyped());
+        if (*function[param].type != arg->getType()) {
             // In-qualified arguments just need an extra node added above the argument to
             // convert to the correct type.
-            TIntermTyped* convArg = intermediate.addConversion(EOpFunctionCall, *function[i].type, arg);
+            TIntermTyped* convArg = intermediate.addConversion(EOpFunctionCall, *function[param].type, arg);
             if (convArg != nullptr)
-                convArg = intermediate.addUniShapeConversion(EOpFunctionCall, *function[i].type, convArg);
+                convArg = intermediate.addUniShapeConversion(EOpFunctionCall, *function[param].type, convArg);
             if (convArg != nullptr)
-                setArg(i, convArg);
+                setArg(param, convArg);
             else
-                error(arg->getLoc(), "cannot convert input argument, argument", "", "%d", i);
+                error(arg->getLoc(), "cannot convert input argument, argument", "", "%d", param);
         } else {
             if (wasFlattened(arg) || wasSplit(arg)) {
-                // Will make a two-level subtree.
-                // The deepest will copy member-by-member to build the structure to pass.
-                // The level above that will be a two-operand EOpComma sequence that follows the copy by the
-                // object itself.
-                TVariable* internalAggregate = makeInternalVariable("aggShadow", *function[i].type);
-                internalAggregate->getWritableType().getQualifier().makeTemporary();
-                TIntermSymbol* internalSymbolNode = new TIntermSymbol(internalAggregate->getUniqueId(),
-                                                                      internalAggregate->getName(),
-                                                                      internalAggregate->getType());
-                internalSymbolNode->setLoc(arg->getLoc());
-                // This makes the deepest level, the member-wise copy
-                TIntermAggregate* assignAgg = handleAssign(arg->getLoc(), EOpAssign, internalSymbolNode, arg)->getAsAggregate();
+                // If both formal and calling arg are to be flattened, leave that to argument
+                // expansion, not conversion.
+                if (!shouldFlatten(*function[param].type)) {
+                    // Will make a two-level subtree.
+                    // The deepest will copy member-by-member to build the structure to pass.
+                    // The level above that will be a two-operand EOpComma sequence that follows the copy by the
+                    // object itself.
+                    TVariable* internalAggregate = makeInternalVariable("aggShadow", *function[param].type);
+                    internalAggregate->getWritableType().getQualifier().makeTemporary();
+                    TIntermSymbol* internalSymbolNode = new TIntermSymbol(internalAggregate->getUniqueId(),
+                                                                          internalAggregate->getName(),
+                                                                          internalAggregate->getType());
+                    internalSymbolNode->setLoc(arg->getLoc());
+                    // This makes the deepest level, the member-wise copy
+                    TIntermAggregate* assignAgg = handleAssign(arg->getLoc(), EOpAssign, internalSymbolNode, arg)->getAsAggregate();
 
-                // Now, pair that with the resulting aggregate.
-                assignAgg = intermediate.growAggregate(assignAgg, internalSymbolNode, arg->getLoc());
-                assignAgg->setOperator(EOpComma);
-                assignAgg->setType(internalAggregate->getType());
-                setArg(i, assignAgg);
+                    // Now, pair that with the resulting aggregate.
+                    assignAgg = intermediate.growAggregate(assignAgg, internalSymbolNode, arg->getLoc());
+                    assignAgg->setOperator(EOpComma);
+                    assignAgg->setType(internalAggregate->getType());
+                    setArg(param, assignAgg);
+                }
             }
         }
     }
 }
 
+//
+// Add any needed implicit expansion of calling arguments from what the shader listed to what's
+// internally needed for the AST (given the constraints downstream).
+//
+void HlslParseContext::expandArguments(const TSourceLoc& loc, const TFunction& function, TIntermTyped*& arguments)
+{
+    TIntermAggregate* aggregate = arguments->getAsAggregate();
+    int functionParamNumberOffset = 0;  // net number of arguments added so far by expansion
+
+    // Replace the single argument at position 'paramNum' with the single argument 'arg'.
+    const auto setArg = [&](int paramNum, TIntermTyped* arg) {
+        if (function.getParamCount() + functionParamNumberOffset == 1)
+            arguments = arg;
+        else {
+            if (aggregate == nullptr)
+                arguments = arg;
+            else
+                aggregate->getSequence()[paramNum] = arg;
+        }
+    };
+
+    // Replace a single argument with a list of arguments; an empty list leaves the argument untouched.
+    const auto setArgList = [&](int paramNum, const TVector<TIntermTyped*>& args) {
+        if (args.size() == 1)
+            setArg(paramNum, args.front());
+        else if (args.size() > 1) {  // guard: front() and begin() + 1 are invalid on an empty list
+            if (function.getParamCount() + functionParamNumberOffset == 1) {
+                arguments = intermediate.makeAggregate(args.front());
+                std::for_each(args.begin() + 1, args.end(),
+                    [&](TIntermTyped* arg) {
+                        arguments = intermediate.growAggregate(arguments, arg);
+                    });
+            } else {
+                auto it = aggregate->getSequence().erase(aggregate->getSequence().begin() + paramNum);
+                aggregate->getSequence().insert(it, args.begin(), args.end());
+            }
+            functionParamNumberOffset += (int)(args.size() - 1);  // size() > 1 here, so no unsigned wrap
+        }
+    };
+
+    // Process each argument's expansion
+    for (int param = 0; param < function.getParamCount(); ++param) {
+        // At this early point there is a slight ambiguity between whether an aggregate 'arguments'
+        // is the single argument itself or its children are the arguments.  Only one argument
+        // means take 'arguments' itself as the one argument.
+        TIntermTyped* arg = function.getParamCount() == 1
+                                   ? arguments->getAsTyped()
+                                   : (aggregate ?
+                                        aggregate->getSequence()[param + functionParamNumberOffset]->getAsTyped() :
+                                        arguments->getAsTyped());
+
+        if (wasFlattened(arg) && shouldFlatten(*function[param].type)) {
+            // Need to pass the structure members instead of the structure.
+            TVector<TIntermTyped*> memberArgs;
+            for (int memb = 0; memb < (int)arg->getType().getStruct()->size(); ++memb) {
+                initFlattening();
+                memberArgs.push_back(flattenAccess(arg, memb));
+                finalizeFlattening();
+            }
+            setArgList(param + functionParamNumberOffset, memberArgs);
+        }
+    }
+
+    // TODO: if we need both hidden counter args (below) and struct expansion (above)
+    // the two algorithms need to be merged: Each assumes the list starts out 1:1 between
+    // parameters and arguments.
+
+    // If any argument is a pass-by-reference struct buffer with an associated counter
+    // buffer, we have to add another hidden parameter for that counter.
+    if (aggregate)
+        addStructBuffArguments(loc, aggregate);
+}
+
 //
 // Add any needed implicit output conversions for function-call arguments.  This
 // can require a new tree topology, complicated further by whether the function
@@ -4682,7 +4800,7 @@ void HlslParseContext::addStructBuffArguments(const TSourceLoc& loc, TIntermAggr
 
     TIntermSequence argsWithCounterBuffers;
 
-    for (int param=0; param<int(aggregate->getSequence().size()); ++param) {
+    for (int param = 0; param < int(aggregate->getSequence().size()); ++param) {
         argsWithCounterBuffers.push_back(aggregate->getSequence()[param]);
 
         if (hasStructBuffCounter(aggregate->getSequence()[param]->getAsTyped()->getType())) {
@@ -6769,7 +6887,7 @@ TIntermNode* HlslParseContext::declareVariable(const TSourceLoc& loc, const TStr
 
     inheritGlobalDefaults(type.getQualifier());
 
-    const bool flattenVar = shouldFlattenUniform(type);
+    const bool flattenVar = shouldFlatten(type);
 
     // correct IO in the type
     switch (type.getQualifier().storage) {
diff --git a/hlsl/hlslParseHelper.h b/hlsl/hlslParseHelper.h
index f7b293abb..eeba37e5e 100755
--- a/hlsl/hlslParseHelper.h
+++ b/hlsl/hlslParseHelper.h
@@ -96,6 +96,7 @@ public:
     void decomposeGeometryMethods(const TSourceLoc&, TIntermTyped*& node, TIntermNode* arguments);
     void pushFrontArguments(TIntermTyped* front, TIntermTyped*& arguments);
     void addInputArgumentConversions(const TFunction&, TIntermTyped*&);
+    void expandArguments(const TSourceLoc&, const TFunction&, TIntermTyped*&);
     TIntermTyped* addOutputArgumentConversions(const TFunction&, TIntermOperator&);
     void builtInOpCheck(const TSourceLoc&, const TFunction&, TIntermOperator&);
     TFunction* makeConstructorCall(const TSourceLoc&, const TType&);
@@ -236,13 +237,14 @@ protected:
 
     // Array and struct flattening
     TIntermTyped* flattenAccess(TIntermTyped* base, int member);
-    bool shouldFlattenUniform(const TType&) const;
+    TIntermTyped* flattenAccess(int uniqueId, int member, const TType&);
+    bool shouldFlatten(const TType&) const;
     bool wasFlattened(const TIntermTyped* node) const;
     bool wasFlattened(int id) const { return flattenMap.find(id) != flattenMap.end(); }
     int  addFlattenedMember(const TSourceLoc& loc, const TVariable&, const TType&, TFlattenData&, const TString& name, bool track);
     bool isFinalFlattening(const TType& type) const { return !(type.isStruct() || type.isArray()); }
 
-    // Structure splitting (splits interstage builtin types into its own struct)
+    // Structure splitting (splits interstage built-in types into its own struct)
     TIntermTyped* splitAccessStruct(const TSourceLoc& loc, TIntermTyped*& base, int& member);
     void splitAccessArray(const TSourceLoc& loc, TIntermTyped* base, TIntermTyped* index);
     TType& split(TType& type, TString name, const TType* outerStructType = nullptr);
diff --git a/hlsl/hlslParseables.cpp b/hlsl/hlslParseables.cpp
index e094aef9d..db67c39df 100755
--- a/hlsl/hlslParseables.cpp
+++ b/hlsl/hlslParseables.cpp
@@ -502,7 +502,7 @@ void TBuiltInParseablesHlsl::initialize(int /*version*/, EProfile /*profile*/, c
     static const EShLanguageMask EShLangAll    = EShLanguageMask(EShLangCount - 1);
 
     // These are the actual stage masks defined in the documentation, in case they are
-    // needed for furture validation.  For now, they are commented out, and set below
+    // needed for future validation.  For now, they are commented out, and set below
     // to EShLangAll, to allow any intrinsic to be used in any shader, which is legal
     // if it is not called.
     //