diff --git a/OGLCompilersDLL/InitializeDll.cpp b/OGLCompilersDLL/InitializeDll.cpp
index 2eb912c42..c1f03c09c 100644
--- a/OGLCompilersDLL/InitializeDll.cpp
+++ b/OGLCompilersDLL/InitializeDll.cpp
@@ -45,6 +45,10 @@ namespace glslang {
 
 OS_TLSIndex ThreadInitializeIndex = OS_INVALID_TLS_INDEX;
 
+// Per-process initialization.
+// Needs to be called at least once before parsing, etc. is done.
+// Will also do thread initialization for the calling thread; other
+// threads will need to do that explicitly.
 bool InitProcess()
 {
     glslang::GetGlobalLock();
@@ -85,7 +89,9 @@ bool InitProcess()
     return true;
 }
 
-
+// Per-thread scoped initialization.
+// Must be called at least once by each new thread sharing the
+// symbol tables, etc., needed to parse.
 bool InitThread()
 {
     //
@@ -109,7 +115,9 @@ bool InitThread()
     return true;
 }
 
-
+// Thread-scoped tear down. Needs to be done by all but the last
+// thread, which calls DetachProcess() instead.
+// NB. TODO: Not currently being executed by each thread.
 bool DetachThread()
 {
     bool success = true;
@@ -126,13 +134,13 @@ bool DetachThread()
             success = false;
         }
 
-        FreeGlobalPools();
-
+        FreeMemoryPools();
     }
 
     return success;
 }
 
+// Process-scoped tear down. Needs to be done by final thread in process.
 bool DetachProcess()
 {
     bool success = true;
diff --git a/OGLCompilersDLL/InitializeDll.h b/OGLCompilersDLL/InitializeDll.h
index 60b2b1593..281f3b1a9 100644
--- a/OGLCompilersDLL/InitializeDll.h
+++ b/OGLCompilersDLL/InitializeDll.h
@@ -40,7 +40,7 @@ namespace glslang {
 
 bool InitProcess();
 bool InitThread();
-bool DetachThread();
+bool DetachThread(); // TODO: use this or remove it; ideally make it unneeded
 bool DetachProcess();
 
 } // end namespace glslang
diff --git a/glslang/Include/InitializeGlobals.h b/glslang/Include/InitializeGlobals.h
index 4cf2dca7c..dd7554b62 100644
--- a/glslang/Include/InitializeGlobals.h
+++ b/glslang/Include/InitializeGlobals.h
@@ -38,7 +38,8 @@
 namespace glslang {
 
 void InitializeMemoryPools();
-void FreeGlobalPools();
+void FreeMemoryPools();
+
 bool InitializePoolIndex();
 void FreePoolIndex();
 
diff --git a/glslang/Include/PoolAlloc.h b/glslang/Include/PoolAlloc.h
index 69bacb156..0e237a6a2 100644
--- a/glslang/Include/PoolAlloc.h
+++ b/glslang/Include/PoolAlloc.h
@@ -250,15 +250,8 @@ private:
 // different times.  But a simple use is to have a global pop
 // with everyone using the same global allocator.
 //
-typedef TPoolAllocator* PoolAllocatorPointer;
 extern TPoolAllocator& GetThreadPoolAllocator();
-
-struct TThreadMemoryPools
-{
-    TPoolAllocator* threadPoolAllocator;
-};
-
-void SetThreadPoolAllocator(TPoolAllocator& poolAllocator);
+void SetThreadPoolAllocator(TPoolAllocator* poolAllocator);
 
 //
 // This STL compatible allocator is intended to be used as the allocator
diff --git a/glslang/MachineIndependent/PoolAlloc.cpp b/glslang/MachineIndependent/PoolAlloc.cpp
index 4007c3861..5077a26bc 100644
--- a/glslang/MachineIndependent/PoolAlloc.cpp
+++ b/glslang/MachineIndependent/PoolAlloc.cpp
@@ -40,35 +40,40 @@
 
 namespace glslang {
 
+// Process-wide TLS index
 OS_TLSIndex PoolIndex;
 
-void InitializeMemoryPools()
+// Per-thread structure holding pool pointers.
+struct TThreadMemoryPools
 {
-    TThreadMemoryPools* pools = static_cast<TThreadMemoryPools*>(OS_GetTLSValue(PoolIndex));
-    if (pools)
-        return;
+    TPoolAllocator* threadPoolAllocator; // the current pool
+};
 
-    TPoolAllocator *threadPoolAllocator = new TPoolAllocator();
-
-    TThreadMemoryPools* threadData = new TThreadMemoryPools();
-
-    threadData->threadPoolAllocator = threadPoolAllocator;
-
-    OS_SetTLSValue(PoolIndex, threadData);
+// Return the thread-specific pool pointers.
+TThreadMemoryPools* GetThreadMemoryPools()
+{
+    return static_cast<TThreadMemoryPools*>(OS_GetTLSValue(PoolIndex));
 }
 
-void FreeGlobalPools()
+// Set the thread-specific pool pointers.
+void SetThreadMemoryPools(TThreadMemoryPools* pools)
 {
-    // Release the allocated memory for this thread.
-    TThreadMemoryPools* globalPools = static_cast<TThreadMemoryPools*>(OS_GetTLSValue(PoolIndex));
-    if (! globalPools)
-        return;
-
-    GetThreadPoolAllocator().popAll();
-    delete &GetThreadPoolAllocator();
-    delete globalPools;
+    OS_SetTLSValue(PoolIndex, pools);
 }
 
+// Return the thread-specific current pool.
+TPoolAllocator& GetThreadPoolAllocator()
+{
+    return *GetThreadMemoryPools()->threadPoolAllocator;
+}
+
+// Set the thread-specific current pool.
+void SetThreadPoolAllocator(TPoolAllocator* poolAllocator)
+{
+    GetThreadMemoryPools()->threadPoolAllocator = poolAllocator;
+}
+
+// Process-wide set up of the TLS pool storage.
 bool InitializePoolIndex()
 {
     // Allocate a TLS index.
@@ -78,24 +83,30 @@ bool InitializePoolIndex()
     return true;
 }
 
+// Process-wide tear down of the TLS pool storage.
 void FreePoolIndex()
 {
     // Release the TLS index.
     OS_FreeTLSIndex(PoolIndex);
 }
 
-TPoolAllocator& GetThreadPoolAllocator()
+// Per-thread set up of the memory pools.
+void InitializeMemoryPools()
 {
-    TThreadMemoryPools* threadData = static_cast<TThreadMemoryPools*>(OS_GetTLSValue(PoolIndex));
-
-    return *threadData->threadPoolAllocator;
+    if (GetThreadMemoryPools() == nullptr) {
+        SetThreadMemoryPools(new TThreadMemoryPools());
+        SetThreadPoolAllocator(new TPoolAllocator());
+    }
 }
 
-void SetThreadPoolAllocator(TPoolAllocator& poolAllocator)
+// Per-thread tear down of the memory pools.
+void FreeMemoryPools()
 {
-    TThreadMemoryPools* threadData = static_cast<TThreadMemoryPools*>(OS_GetTLSValue(PoolIndex));
-
-    threadData->threadPoolAllocator = &poolAllocator;
+    if (GetThreadMemoryPools() != nullptr) {
+        GetThreadPoolAllocator().popAll();
+        delete &GetThreadPoolAllocator();
+        delete GetThreadMemoryPools();
+    }
 }
 
 //
diff --git a/glslang/MachineIndependent/ShaderLang.cpp b/glslang/MachineIndependent/ShaderLang.cpp
index c8e954ce0..8de5297be 100644
--- a/glslang/MachineIndependent/ShaderLang.cpp
+++ b/glslang/MachineIndependent/ShaderLang.cpp
@@ -217,7 +217,7 @@ enum EPrecisionClass {
 TSymbolTable* CommonSymbolTable[VersionCount][SpvVersionCount][ProfileCount][SourceCount][EPcCount] = {};
 TSymbolTable* SharedSymbolTables[VersionCount][SpvVersionCount][ProfileCount][SourceCount][EShLangCount] = {};
 
-TPoolAllocator* PerProcessGPA = 0;
+TPoolAllocator* PerProcessGPA = nullptr;
 
 //
 // Parse and add to the given symbol table the content of the given shader string.
@@ -361,7 +361,7 @@ bool AddContextSpecificSymbols(const TBuiltInResource* resources, TInfoSink& inf
 // pool allocator intact, so:
 //  - Switch to a new pool for parsing the built-ins
 //  - Do the parsing, which builds the symbol table, using the new pool
-//  - Switch to the process-global pool to save a copy the resulting symbol table
+//  - Switch to the process-global pool to save a copy of the resulting symbol table
 //  - Free up the new pool used to parse the built-ins
 //  - Switch back to the original thread's pool
 //
@@ -388,8 +388,8 @@ void SetupBuiltinSymbolTable(int version, EProfile profile, const SpvVersion& sp
 
     // Switch to a new pool
     TPoolAllocator& previousAllocator = GetThreadPoolAllocator();
-    TPoolAllocator* builtInPoolAllocator = new TPoolAllocator();
-    SetThreadPoolAllocator(*builtInPoolAllocator);
+    TPoolAllocator* builtInPoolAllocator = new TPoolAllocator;
+    SetThreadPoolAllocator(builtInPoolAllocator);
 
     // Dynamically allocate the local symbol tables so we can control when they are deallocated WRT when the pool is popped.
     TSymbolTable* commonTable[EPcCount];
@@ -403,7 +403,7 @@ void SetupBuiltinSymbolTable(int version, EProfile profile, const SpvVersion& sp
     InitializeSymbolTables(infoSink, commonTable, stageTables, version, profile, spvVersion, source);
 
     // Switch to the process-global pool
-    SetThreadPoolAllocator(*PerProcessGPA);
+    SetThreadPoolAllocator(PerProcessGPA);
 
     // Copy the local symbol tables from the new pool to the global tables using the process-global pool
     for (int precClass = 0; precClass < EPcCount; ++precClass) {
@@ -430,7 +430,7 @@ void SetupBuiltinSymbolTable(int version, EProfile profile, const SpvVersion& sp
         delete stageTables[stage];
 
     delete builtInPoolAllocator;
-    SetThreadPoolAllocator(previousAllocator);
+    SetThreadPoolAllocator(&previousAllocator);
 
     glslang::ReleaseGlobalLock();
 }
@@ -1196,7 +1196,7 @@ int ShInitialize()
     if (! InitProcess())
         return 0;
 
-    if (! PerProcessGPA)
+    if (PerProcessGPA == nullptr)
         PerProcessGPA = new TPoolAllocator();
 
     glslang::TScanContext::fillInKeywordMap();
@@ -1288,10 +1288,10 @@ int __fastcall ShFinalize()
         }
     }
 
-    if (PerProcessGPA) {
+    if (PerProcessGPA != nullptr) {
         PerProcessGPA->popAll();
         delete PerProcessGPA;
-        PerProcessGPA = 0;
+        PerProcessGPA = nullptr;
     }
 
     glslang::TScanContext::deleteKeywordMap();
@@ -1708,7 +1708,7 @@ bool TShader::parse(const TBuiltInResource* builtInResources, int defaultVersion
         return false;
 
     pool = new TPoolAllocator();
-    SetThreadPoolAllocator(*pool);
+    SetThreadPoolAllocator(pool);
     if (! preamble)
         preamble = "";
 
@@ -1732,7 +1732,7 @@ bool TShader::preprocess(const TBuiltInResource* builtInResources,
         return false;
 
     pool = new TPoolAllocator();
-    SetThreadPoolAllocator(*pool);
+    SetThreadPoolAllocator(pool);
     if (! preamble)
         preamble = "";
 
@@ -1789,7 +1789,7 @@ bool TProgram::link(EShMessages messages)
     bool error = false;
 
     pool = new TPoolAllocator();
-    SetThreadPoolAllocator(*pool);
+    SetThreadPoolAllocator(pool);
 
     for (int s = 0; s < EShLangCount; ++s) {
         if (! linkStage((EShLanguage)s, messages))
diff --git a/glslang/Public/ShaderLang.h b/glslang/Public/ShaderLang.h
index 6fadfbf0c..6e22bdd72 100644
--- a/glslang/Public/ShaderLang.h
+++ b/glslang/Public/ShaderLang.h
@@ -68,15 +68,14 @@
 #endif
 
 //
-// Driver must call this first, once, before doing any other
-// compiler/linker operations.
+// Call before doing any other compiler/linker operations.
 //
 // (Call once per process, not once per thread.)
 //
 SH_IMPORT_EXPORT int ShInitialize();
 
 //
-// Driver should call this at process shutdown.
+// Call this at process shutdown to clean up memory.
 //
 SH_IMPORT_EXPORT int __fastcall ShFinalize();
 
@@ -290,7 +289,7 @@ SH_IMPORT_EXPORT int ShGetUniformLocation(const ShHandle uniformMap, const char*
 // Deferred-Lowering C++ Interface
 // -----------------------------------
 //
-// Below is a new alternate C++ interface that might potentially replace the above
+// Below is a new alternate C++ interface, which deprecates the above
 // opaque handle-based interface.
 //
 // The below is further designed to handle multiple compilation units per stage, where