/***
Copyright (C) 2021 J Reece Wilson (a/k/a "Reece"). All rights reserved.
File: AuCompression.hpp
Date: 2021-6-17
Author: Reece
***/
#pragma once
namespace Aurora::Compression
{
#if defined(AURORA_IS_SERVER) || defined(AURORA_16_KIB_IN_OUT_BUFFERS_32K)
    // servers can have multiple of these per client, if streams are not to be recycled.
    // XMPP, for example, does not reset; zlib and others expect genuine compression streams.
    // HTTP, libdeflate, and others expect flush frames, and then for the stateless stream to be recreated each IO tick.
    // let's assume we are a server:
    // most of our IO is done under protocol stacks that'll handle almost all of the buffering;
    // changing this value by 2-8x will apply to the law of large numbers on a server;
    // changing this value by an order of magnitude is a cost of 10x memory per client (though you can afford the memory these days);
    // the underlying compressor object may not even be accessing these temp[kChunkSize] optimizations - some can read directly from AuByteBuffers;
    // some compression libraries are internally alloc heavy, which delegates the usage area to malloc/free callbacks...
    // ...these, like zstd, demand almost zero buffered data. a frame fragment is almost nothing.
    // therefore, it makes sense to keep these values down as much as possible.
    // (a sketch of a typical chunked inflate loop follows this #if block)
    static const AuUInt64 kChunkSize = 16 * 1024;
#elif defined(AURORA_IS_CLIENT)
    // for builds explicitly earmarked to go out to end users, 2x the usual buffer size
    static const AuUInt64 kChunkSize = 256 * 1024;
#else
    // and general purpose.
    static const AuUInt64 kChunkSize = 128 * 1024;
    // also note: zlib docs state up to 256k is efficient, but you can start at around 16k
    // https://www.zlib.net/zlib_how.html Last modified 8 February 2023; originally written approx 30 Oct 2004 - 11 Dec 2005.
    // general infrastructure and requirements haven't changed in two decades:
    // in 2001 and 2021 you can afford to buy a server with a few tens of gigabytes of ram
    // in 2001 and 2021 you can afford to buy a gigabit switch
    // in 2001 and 2021 you're still running on deflate algorithms
    // in 2004 and 2024 you're still struggling to parallelize compression without N x tarballs on N x cores (N ~= 2-6 would be reasonable)
    // in 2012-2014 you could get yourself an Ivy Bridge or a Sandy Bridge; in 2024 you could get yourself an AMD page-aligned atomic acker or an Oxide Inside 14th gen,
    // with ~4th gen being, in theory, only a low single-digit multiple slower than 14th gen. and, yes, comparing 4th gen to 9th gen compilation times will show 4-8x boosts, yet
    // on a very fundamental level, we haven't moved that much in terms of single-threaded performance.
    // that also applies to compression. it's still the same old slow task: calculating bit patterns, looking through big precomputed (or not) datasets, and making in-order & non-parallelizable changes.
    // with hardware not progressing **that** much, the fundamental algorithms staying the exact same, and the same libraries being used, call me a bit insane to think maybe an article from 2005 isn't that far off from reality.
    // 16 KiB per stream * 128k clients ~= 2 GiB (x2 for local in/out? [= 4 GiB]; x4 for an additional in/out stream pairing? [= 8 GiB]) of ram for a heavyweight server (plus many other user implementation overheads)
    // 16k per stream / 256MB on an embedded tablet or games console
    // 16k per stream / 256 - 2048MB per app / 1024 - 4096MB per home appliance mid 2000s (laptop, pc, etc)
    // 16k per stream / 256 - 2048MB per app / 4096 - 32768MB per home appliance mid 2020s (laptop, pc, etc)
    // 64k (2^16) IPv4 UDP ports, including system-reserved ranges and others
    // 2x IPv6 addresses = 128k
    // AAA software can see peaks of hundreds of thousands to millions of clients per platform, so x2, x10, x20, x100 this (more responsive stateful, proprietary, xmpp, etc protocols)
    // WebShid "technology" works on stateless packets, so your memory requirements are basically just this in the network stream buffers, with the rest being temporary allocations (more expensive & wasteful for stream-like connections).
    // seems fair...? seems like the world we should still be living in...? maybe we can 10x or 100x it no matter the era for increased throughput...? would zlib even care...?
#endif
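
    // Illustrative sketch (not part of this header): how a kChunkSize-sized temp
    // buffer is typically drained in a chunked inflate loop. This uses raw zlib
    // (z_stream / inflateInit2 / inflate from <zlib.h>) rather than Aurora's own
    // stream wrappers, and pCompressed / uCompressedLength / Sink are hypothetical:
    //
    //     unsigned char temp[kChunkSize];
    //     z_stream zs {};
    //     if (inflateInit2(&zs, 15 | 16) != Z_OK)          // 15 | 16 -> gzip container
    //     {
    //         return;
    //     }
    //     zs.next_in  = pCompressed;
    //     zs.avail_in = uCompressedLength;
    //     int ret {};
    //     do
    //     {
    //         zs.next_out  = temp;                         // decompress into the chunk...
    //         zs.avail_out = sizeof(temp);
    //         ret = inflate(&zs, Z_NO_FLUSH);
    //         Sink(temp, sizeof(temp) - zs.avail_out);     // ...then hand it off downstream
    //     } while (ret == Z_OK);
    //     inflateEnd(&zs);
    //
    // each live stream pins roughly kChunkSize of scratch memory at a time, which is
    // why the server configuration above pulls the value down to 16 KiB.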
    // scratch chunk size; sometimes allocated on the stack
    static const AuUInt64 kChunkSize2 = 128 * 1024;
    /// * compression type + bits -> internal zlib windowBits
    /// (note: despite the name, this derives windowBits, not a compression level)
    static bool CompressionLevelFromExternalApi(const DecompressInfo &info, AuInt8 &out)
    {
        out = 0;

        if (!info.uOptWindowBits)
        {
            // no explicit window size requested: use zlib's maximum of 15 bits,
            // encoded per zlib convention (windowBits | 16 -> gzip container,
            // negative windowBits -> raw deflate, plain value -> zlib wrapper)
            if (info.alg == ECompressionType::eGZip)
            {
                out = 15 | 16;
            }
            else if (info.alg == ECompressionType::eDeflate)
            {
                out = -15;
            }
            else
            {
                out = 15;
            }

            return true;
        }

        if (info.uOptWindowBits.value() < 0)
        {
            return false;
        }

        if (info.uOptWindowBits.value() > 15)
        {
            return false;
        }

        if (info.alg == ECompressionType::eGZip)
        {
            out = info.uOptWindowBits.value() | 16;
        }
        else if (info.alg == ECompressionType::eDeflate)
        {
            out = 0 - info.uOptWindowBits.value();
        }
        else
        {
            out = info.uOptWindowBits.value();
        }

        return true;
    }
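
    // Illustrative mapping (not part of this header), following the windowBits
    // convention documented for inflateInit2 in <zlib.h>; the result is presumably
    // handed to zlib's init call on the decompression path:
    //
    //     alg       | uOptWindowBits | out (windowBits)
    //     ----------+----------------+------------------------------------
    //     eGZip     | (unset)        | 15 | 16 = 31  (gzip container)
    //     eDeflate  | (unset)        | -15           (raw deflate, headerless)
    //     (other)   | (unset)        | 15            (zlib wrapper)
    //     eGZip     | 12             | 12 | 16 = 28
    //     eDeflate  | 12             | -12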
    /// * compression type + bits -> internal zlib windowBits
    /// (same mapping as above, for the compression path)
    static bool CompressionLevelFromExternalApi(const CompressInfo &info, AuInt8 &out)
    {
        out = 0;

        if (!info.uOptWindowBits)
        {
            // no explicit window size requested: default to 15 bits, encoded as above
            if (info.type == ECompressionType::eGZip)
            {
                out = 15 | 16;
            }
            else if (info.type == ECompressionType::eDeflate)
            {
                out = -15;
            }
            else
            {
                out = 15;
            }

            return true;
        }

        // mirror the range validation of the decompression overload above;
        // zlib's deflateInit2 rejects windowBits outside of its supported range
        if (info.uOptWindowBits.value() < 0 ||
            info.uOptWindowBits.value() > 15)
        {
            return false;
        }

        if (info.type == ECompressionType::eGZip)
        {
            out = info.uOptWindowBits.value() | 16;
        }
        else if (info.type == ECompressionType::eDeflate)
        {
            out = 0 - info.uOptWindowBits.value();
        }
        else
        {
            out = info.uOptWindowBits.value();
        }

        return true;
    }
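
    // Illustrative sketch (not part of this header): consuming the mapping above on
    // the compression path with raw zlib. deflateInit2's signature is as documented
    // in <zlib.h>; compressInfo is a hypothetical, already-populated CompressInfo:
    //
    //     AuInt8 windowBits {};
    //     if (!CompressionLevelFromExternalApi(compressInfo, windowBits))
    //     {
    //         return false;                            // out-of-range window bits
    //     }
    //
    //     z_stream zs {};
    //     int ret = deflateInit2(&zs, Z_DEFAULT_COMPRESSION, Z_DEFLATED,
    //                            windowBits, 8 /* memLevel */, Z_DEFAULT_STRATEGY);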
}