/***
    Copyright (C) 2021 J Reece Wilson (a/k/a "Reece"). All rights reserved.

    File: AuCompression.hpp
    Date: 2021-6-17
    Author: Reece
***/
#pragma once

namespace Aurora::Compression
{
#if defined(AURORA_IS_SERVER) || defined(AURORA_16_KIB_IN_OUT_BUFFERS_32K)
    // servers can have multiple of these per client, if streams are not to be recycled.
    // XMPP, for example, does not reset its stream; zlib and others expect genuine long-lived compression streams.
    // HTTP, libdeflate, and others expect flush frames, and then for the stateless stream to be recreated each IO tick.
    // let's assume we are a server:
    // most of our IO is done under protocol stacks that'll handle almost all of the buffering,
    // scaling this value by 2-8x gets multiplied across every client by the law of large numbers on a server,
    // scaling this value by an order of magnitude costs 10x memory per client (though you can afford the memory these days),
    // the underlying compressor object may not even be accessing these temp[kChunkSize] optimizations - some can directly read from AuByteBuffers,
    // some compression libraries are internally alloc-heavy, which delegates the usage area to malloc/free callbacks...
    // ...these, like zstd, demand almost zero buffered data. a frame fragment is almost nothing.
    // therefore, it makes sense to keep these values down as much as possible
    static const AuUInt64 kChunkSize = 16 * 1024;
#elif defined(AURORA_IS_CLIENT)
    // for builds explicitly earmarked to go out to end users, 2x the usual buffer size
    static const AuUInt64 kChunkSize = 256 * 1024;
#else
    // ...and general purpose.
    static const AuUInt64 kChunkSize = 128 * 1024;
    // also note: the zlib docs state up to 256k is efficient, but you can start at around 16k
    // https://www.zlib.net/zlib_how.html ("Last modified 8 February 2023"; originally written approx 30 Oct 2004 - 11 Dec 2005)
    // general infrastructure and requirements haven't changed in two decades:
    // in 2001 and 2021 you can afford to buy a server with a few tens of gigabytes of ram
    // in 2001 and 2021 you can afford to buy a gigabit switch
    // in 2001 and 2021 you're still running on deflate algorithms
    // in 2004 and 2024 you're still struggling to parallelize compression without N x tarballs on N x cores (N ~= 2-6 would be reasonable)
    // in 2012-2014 you could get yourself an Ivy Bridge or a Sandy Bridge; in 2024 you could get yourself an AMD page-aligned atomic acker or Oxide Inside 14th gen,
    // with ~4th gen being, in theory, only a small single-digit multiple slower than 14th gen. and, yes, comparing 4th gen to 9th gen compilation times will show 4-8x boosts, but
    // on a very fundamental level, we haven't moved that much in terms of single-threaded performance.
    // that also applies to compression. it's still the same old slow task: calculating bit patterns, looking through big precomputed (or not) datasets, and making in-order & non-parallelizable changes.
    // with hardware not progressing **that** much, the fundamental algorithms staying exactly the same, and the same libraries being used, call me a bit insane to think maybe an article from 2005 isn't that far off from reality.
    // 16k clients * 128k buffers ~= 2GiB (x2 for local in/out? [= 4GiB]; x4 for an additional in/out stream pairing? [= 8GiB]) of ram for a heavyweight server (plus many other user implementation overheads)
    // 16k per stream / 256MB on an embedded tablet or games console
    // 16k per stream / 256 - 2048MB per app / 1024 - 4096MB per home appliance, mid 2000s (laptop, pc, etc)
    // 16k per stream / 256 - 2048MB per app / 4096 - 32768MB per home appliance, mid 2020s (laptop, pc, etc)
    // 64k (2^16) IPv4 UDP ports, including the system-reserved region and others
    // 2x for an IPv6 address = 128k
    // AAA software can see peaks of hundreds of thousands to millions of clients per platform, so x2, x10, x20, x100 this (more responsive stateful, proprietary, xmpp, etc protocols)
    // WebShid "technology" works on stateless packets, so your memory requirements are basically just this in the network stream buffers, with the rest being temporary allocations (more expensive & wasteful for stream-like connections).
    // seems fair...? seems like the world we should still be living in...? maybe we can 10x or 100x it no matter the era for increased throughput...? would zlib even care...?
#endif

    // sometimes on stack
    static const AuUInt64 kChunkSize2 = 128 * 1024;
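
    // Illustrative only (not part of this API): a minimal zlib inflate loop,
    // assuming <zlib.h>, showing where a temp[kChunkSize] scratch buffer sits.
    // `windowBits` is what CompressionLevelFromExternalApi below computes;
    // `input`, `uInputLength`, and `Consume` are hypothetical stand-ins for
    // caller state.
    //
    //     z_stream strm {};
    //     if (inflateInit2(&strm, windowBits) != Z_OK)
    //     {
    //         return false;
    //     }
    //
    //     Bytef temp[kChunkSize];                        // per-stream scratch buffer
    //     strm.next_in  = const_cast<Bytef *>(input);
    //     strm.avail_in = uInt(uInputLength);
    //
    //     int ret {};
    //     do
    //     {
    //         strm.next_out  = temp;                     // refill the scratch window...
    //         strm.avail_out = uInt(sizeof(temp));       // ...kChunkSize bytes at a time
    //         ret = inflate(&strm, Z_NO_FLUSH);
    //         if (ret != Z_OK && ret != Z_STREAM_END)
    //         {
    //             break;                                 // corrupt/truncated stream
    //         }
    //         Consume(temp, sizeof(temp) - strm.avail_out);
    //     } while (ret != Z_STREAM_END);
    //
    //     inflateEnd(&strm);
    //
    // a larger kChunkSize means fewer trips around this loop per IO tick; a
    // smaller one means less resident memory per stream.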

    /// compression type + bits -> internal zlib windowBits
    static bool CompressionLevelFromExternalApi(const DecompressInfo &info, AuInt8 &out)
    {
        out = 0;

        if (!info.uOptWindowBits)
        {
            if (info.alg == ECompressionType::eGZip)
            {
                out = 15 | 16;
            }
            else if (info.alg == ECompressionType::eDeflate)
            {
                out = -15;
            }
            else
            {
                out = 15;
            }

            return true;
        }

        if (info.uOptWindowBits.value() < 0)
        {
            return {};
        }

        if (info.uOptWindowBits.value() > 15)
        {
            return {};
        }

        if (info.alg == ECompressionType::eGZip)
        {
            out = info.uOptWindowBits.value() | 16;
        }
        else if (info.alg == ECompressionType::eDeflate)
        {
            out = 0 - info.uOptWindowBits.value();
        }
        else
        {
            out = info.uOptWindowBits.value();
        }

        return true;
    }

    /// compression type + bits -> internal zlib windowBits
    static bool CompressionLevelFromExternalApi(const CompressInfo &info, AuInt8 &out)
    {
        out = 0;

        if (!info.uOptWindowBits)
        {
            if (info.type == ECompressionType::eGZip)
            {
                out = 15 | 16;
            }
            else if (info.type == ECompressionType::eDeflate)
            {
                out = -15;
            }
            else
            {
                out = 15;
            }

            return true;
        }

        // reject out-of-range window bits, as in the decompress overload above
        if (info.uOptWindowBits.value() < 0)
        {
            return {};
        }

        if (info.uOptWindowBits.value() > 15)
        {
            return {};
        }

        if (info.type == ECompressionType::eGZip)
        {
            out = info.uOptWindowBits.value() | 16;
        }
        else if (info.type == ECompressionType::eDeflate)
        {
            out = 0 - info.uOptWindowBits.value();
        }
        else
        {
            out = info.uOptWindowBits.value();
        }

        return true;
    }
}
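
// Notes on the windowBits encoding computed above (per the zlib manual):
//     8..15        -> zlib-wrapped stream
//     -(8..15)     -> raw deflate, no header/trailer (hence `out = -15` / `0 - value()`)
//     (8..15) | 16 -> gzip wrapper (hence `out = 15 | 16`, i.e. 31)
//
// The result is handed straight to zlib. Illustrative only, assuming <zlib.h>;
// `info` and `level` are hypothetical caller-supplied values:
//
//     AuInt8 windowBits {};
//     if (Aurora::Compression::CompressionLevelFromExternalApi(info, windowBits))
//     {
//         z_stream strm {};
//         deflateInit2(&strm, level, Z_DEFLATED, windowBits, 8, Z_DEFAULT_STRATEGY);
//         // ...or inflateInit2(&strm, windowBits) on the decompress path
//     }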