VeraCrypt
Diffstat (limited to 'src/Common/lzma/LzFind.c')
-rw-r--r--  src/Common/lzma/LzFind.c  1717
1 file changed, 1717 insertions, 0 deletions
diff --git a/src/Common/lzma/LzFind.c b/src/Common/lzma/LzFind.c
new file mode 100644
index 00000000..0fbd5aae
--- /dev/null
+++ b/src/Common/lzma/LzFind.c
@@ -0,0 +1,1717 @@
+/* LzFind.c -- Match finder for LZ algorithms
+2023-03-14 : Igor Pavlov : Public domain */
+
+#include "Precomp.h"
+
+#include <string.h>
+// #include <stdio.h>
+
+#include "CpuArch.h"
+#include "LzFind.h"
+#include "LzHash.h"
+
+#define kBlockMoveAlign (1 << 7) // alignment for memmove()
+#define kBlockSizeAlign (1 << 16) // alignment for block allocation
+#define kBlockSizeReserveMin (1 << 24) // it's 1/256 of a 4 GB dictionary
+
+#define kEmptyHashValue 0
+
+#define kMaxValForNormalize ((UInt32)0)
+// #define kMaxValForNormalize ((UInt32)(1 << 20) + 0xfff) // for debug
+
+// #define kNormalizeAlign (1 << 7) // alignment for speculated accesses
+
+#define GET_AVAIL_BYTES(p) \
+ Inline_MatchFinder_GetNumAvailableBytes(p)
+
+
+// #define kFix5HashSize (kHash2Size + kHash3Size + kHash4Size)
+#define kFix5HashSize kFix4HashSize
+
+/*
+ HASH2_CALC:
+  if the (hv) values match, then cur[0] and cur[1] also match
+*/
+#define HASH2_CALC hv = GetUi16(cur);
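+/* a worked example, assuming the usual little-endian GetUi16 from CpuArch.h:
+   cur[0] = 0x34, cur[1] = 0x12  ->  hv = 0x1234,
+   so equal (hv) values imply equal (cur[0], cur[1]) pairs, as noted above. */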
+
+// (crc[0 ... 255] & 0xFF) provides one-to-one correspondence to [0 ... 255]
+
+/*
+ HASH3_CALC:
+  if (cur[0]) and (h2) match, then cur[1] also matches
+ if (cur[0]) and (hv) match, then cur[1] and cur[2] also match
+*/
+#define HASH3_CALC { \
+ UInt32 temp = p->crc[cur[0]] ^ cur[1]; \
+ h2 = temp & (kHash2Size - 1); \
+ hv = (temp ^ ((UInt32)cur[2] << 8)) & p->hashMask; }
+
+#define HASH4_CALC { \
+ UInt32 temp = p->crc[cur[0]] ^ cur[1]; \
+ h2 = temp & (kHash2Size - 1); \
+ temp ^= ((UInt32)cur[2] << 8); \
+ h3 = temp & (kHash3Size - 1); \
+ hv = (temp ^ (p->crc[cur[3]] << kLzHash_CrcShift_1)) & p->hashMask; }
+
+#define HASH5_CALC { \
+ UInt32 temp = p->crc[cur[0]] ^ cur[1]; \
+ h2 = temp & (kHash2Size - 1); \
+ temp ^= ((UInt32)cur[2] << 8); \
+ h3 = temp & (kHash3Size - 1); \
+ temp ^= (p->crc[cur[3]] << kLzHash_CrcShift_1); \
+ /* h4 = temp & p->hash4Mask; */ /* (kHash4Size - 1); */ \
+ hv = (temp ^ (p->crc[cur[4]] << kLzHash_CrcShift_2)) & p->hashMask; }
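+
+/* note: HASH3_CALC, HASH4_CALC and HASH5_CALC build one running value (temp)
+   incrementally, so the shorter hashes (h2, h3) are byproducts of computing
+   the full hash (hv); each level mixes in one more input byte. */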
+
+#define HASH_ZIP_CALC hv = ((cur[2] | ((UInt32)cur[0] << 8)) ^ p->crc[cur[1]]) & 0xFFFF;
+
+
+static void LzInWindow_Free(CMatchFinder *p, ISzAllocPtr alloc)
+{
+ // if (!p->directInput)
+ {
+ ISzAlloc_Free(alloc, p->bufBase);
+ p->bufBase = NULL;
+ }
+}
+
+
+static int LzInWindow_Create2(CMatchFinder *p, UInt32 blockSize, ISzAllocPtr alloc)
+{
+ if (blockSize == 0)
+ return 0;
+ if (!p->bufBase || p->blockSize != blockSize)
+ {
+ // size_t blockSizeT;
+ LzInWindow_Free(p, alloc);
+ p->blockSize = blockSize;
+ // blockSizeT = blockSize;
+
+ // printf("\nblockSize = 0x%x\n", blockSize);
+ /*
+ #if defined _WIN64
+ // we can allocate 4GiB, but still use UInt32 for (p->blockSize)
+ // we use UInt32 type for (p->blockSize), because
+ // we don't want to wrap over 4 GiB,
+ // when we use (p->streamPos - p->pos) that is UInt32.
+ if (blockSize >= (UInt32)0 - (UInt32)kBlockSizeAlign)
+ {
+ blockSizeT = ((size_t)1 << 32);
+ printf("\nchanged to blockSizeT = 4GiB\n");
+ }
+ #endif
+ */
+
+ p->bufBase = (Byte *)ISzAlloc_Alloc(alloc, blockSize);
+ // printf("\nbufferBase = %p\n", p->bufBase);
+ // return 0; // for debug
+ }
+ return (p->bufBase != NULL);
+}
+
+static const Byte *MatchFinder_GetPointerToCurrentPos(CMatchFinder *p) { return p->buffer; }
+
+static UInt32 MatchFinder_GetNumAvailableBytes(CMatchFinder *p) { return GET_AVAIL_BYTES(p); }
+
+
+Z7_NO_INLINE
+static void MatchFinder_ReadBlock(CMatchFinder *p)
+{
+ if (p->streamEndWasReached || p->result != SZ_OK)
+ return;
+
+ /* We use (p->streamPos - p->pos) value.
+ (p->streamPos < p->pos) is allowed. */
+
+ if (p->directInput)
+ {
+ UInt32 curSize = 0xFFFFFFFF - GET_AVAIL_BYTES(p);
+ if (curSize > p->directInputRem)
+ curSize = (UInt32)p->directInputRem;
+ p->streamPos += curSize;
+ p->directInputRem -= curSize;
+ if (p->directInputRem == 0)
+ p->streamEndWasReached = 1;
+ return;
+ }
+
+ for (;;)
+ {
+ const Byte *dest = p->buffer + GET_AVAIL_BYTES(p);
+ size_t size = (size_t)(p->bufBase + p->blockSize - dest);
+ if (size == 0)
+ {
+      /* we call ReadBlock() after NeedMove() and MoveBlock().
+         NeedMove() and MoveBlock() provide more than (keepSizeAfter)
+         to the end of (blockSize).
+         So we don't execute this branch in normal code flow.
+         We can only get here if ReadBlock() is called before NeedMove() / MoveBlock().
+      */
+ // p->result = SZ_ERROR_FAIL; // we can show error here
+ return;
+ }
+
+ // #define kRead 3
+ // if (size > kRead) size = kRead; // for debug
+
+ /*
+ // we need cast (Byte *)dest.
+ #ifdef __clang__
+ #pragma GCC diagnostic ignored "-Wcast-qual"
+ #endif
+ */
+ p->result = ISeqInStream_Read(p->stream,
+ p->bufBase + (dest - p->bufBase), &size);
+ if (p->result != SZ_OK)
+ return;
+ if (size == 0)
+ {
+ p->streamEndWasReached = 1;
+ return;
+ }
+ p->streamPos += (UInt32)size;
+ if (GET_AVAIL_BYTES(p) > p->keepSizeAfter)
+ return;
+    /* here and in the other (p->keepSizeAfter) checks we keep 1 byte more than was requested by the Create() function
+       (GET_AVAIL_BYTES(p) >= p->keepSizeAfter) - minimal required size */
+ }
+
+ // on exit: (p->result != SZ_OK || p->streamEndWasReached || GET_AVAIL_BYTES(p) > p->keepSizeAfter)
+}
+
+
+
+Z7_NO_INLINE
+void MatchFinder_MoveBlock(CMatchFinder *p)
+{
+ const size_t offset = (size_t)(p->buffer - p->bufBase) - p->keepSizeBefore;
+ const size_t keepBefore = (offset & (kBlockMoveAlign - 1)) + p->keepSizeBefore;
+ p->buffer = p->bufBase + keepBefore;
+ memmove(p->bufBase,
+ p->bufBase + (offset & ~((size_t)kBlockMoveAlign - 1)),
+ keepBefore + (size_t)GET_AVAIL_BYTES(p));
+}
+
+/* We call MoveBlock() before ReadBlock().
+   So MoveBlock() can be a wasteful operation, if the whole input data
+   can fit in the current block even without calling MoveBlock().
+   In the important case where (dataSize <= historySize),
+   the condition (p->blockSize > dataSize + p->keepSizeAfter) is met.
+   So there is no MoveBlock() in that case.
+*/
+
+int MatchFinder_NeedMove(CMatchFinder *p)
+{
+ if (p->directInput)
+ return 0;
+ if (p->streamEndWasReached || p->result != SZ_OK)
+ return 0;
+ return ((size_t)(p->bufBase + p->blockSize - p->buffer) <= p->keepSizeAfter);
+}
+
+void MatchFinder_ReadIfRequired(CMatchFinder *p)
+{
+ if (p->keepSizeAfter >= GET_AVAIL_BYTES(p))
+ MatchFinder_ReadBlock(p);
+}
+
+
+
+static void MatchFinder_SetDefaultSettings(CMatchFinder *p)
+{
+ p->cutValue = 32;
+ p->btMode = 1;
+ p->numHashBytes = 4;
+ p->numHashBytes_Min = 2;
+ p->numHashOutBits = 0;
+ p->bigHash = 0;
+}
+
+#define kCrcPoly 0xEDB88320
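+
+/* the table built below in MatchFinder_Construct() is the standard reflected
+   CRC-32 byte table for polynomial 0xEDB88320; here it is used only to mix
+   bytes for hashing, not to checksum data. */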
+
+void MatchFinder_Construct(CMatchFinder *p)
+{
+ unsigned i;
+ p->buffer = NULL;
+ p->bufBase = NULL;
+ p->directInput = 0;
+ p->stream = NULL;
+ p->hash = NULL;
+ p->expectedDataSize = (UInt64)(Int64)-1;
+ MatchFinder_SetDefaultSettings(p);
+
+ for (i = 0; i < 256; i++)
+ {
+ UInt32 r = (UInt32)i;
+ unsigned j;
+ for (j = 0; j < 8; j++)
+ r = (r >> 1) ^ (kCrcPoly & ((UInt32)0 - (r & 1)));
+ p->crc[i] = r;
+ }
+}
+
+#undef kCrcPoly
+
+static void MatchFinder_FreeThisClassMemory(CMatchFinder *p, ISzAllocPtr alloc)
+{
+ ISzAlloc_Free(alloc, p->hash);
+ p->hash = NULL;
+}
+
+void MatchFinder_Free(CMatchFinder *p, ISzAllocPtr alloc)
+{
+ MatchFinder_FreeThisClassMemory(p, alloc);
+ LzInWindow_Free(p, alloc);
+}
+
+static CLzRef* AllocRefs(size_t num, ISzAllocPtr alloc)
+{
+ const size_t sizeInBytes = (size_t)num * sizeof(CLzRef);
+ if (sizeInBytes / sizeof(CLzRef) != num)
+ return NULL;
+ return (CLzRef *)ISzAlloc_Alloc(alloc, sizeInBytes);
+}
+
+#if (kBlockSizeReserveMin < kBlockSizeAlign * 2)
+ #error Stop_Compiling_Bad_Reserve
+#endif
+
+
+
+static UInt32 GetBlockSize(CMatchFinder *p, UInt32 historySize)
+{
+ UInt32 blockSize = (p->keepSizeBefore + p->keepSizeAfter);
+ /*
+ if (historySize > kMaxHistorySize)
+ return 0;
+ */
+ // printf("\nhistorySize == 0x%x\n", historySize);
+
+ if (p->keepSizeBefore < historySize || blockSize < p->keepSizeBefore) // if 32-bit overflow
+ return 0;
+
+ {
+ const UInt32 kBlockSizeMax = (UInt32)0 - (UInt32)kBlockSizeAlign;
+ const UInt32 rem = kBlockSizeMax - blockSize;
+ const UInt32 reserve = (blockSize >> (blockSize < ((UInt32)1 << 30) ? 1 : 2))
+ + (1 << 12) + kBlockMoveAlign + kBlockSizeAlign; // do not overflow 32-bit here
+ if (blockSize >= kBlockSizeMax
+ || rem < kBlockSizeReserveMin) // we reject settings that will be slow
+ return 0;
+ if (reserve >= rem)
+ blockSize = kBlockSizeMax;
+ else
+ {
+ blockSize += reserve;
+ blockSize &= ~(UInt32)(kBlockSizeAlign - 1);
+ }
+ }
+ // printf("\n LzFind_blockSize = %x\n", blockSize);
+ // printf("\n LzFind_blockSize = %d\n", blockSize >> 20);
+ return blockSize;
+}
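+
+/* sketch of the sizing rule above: the reserve is about blockSize/2 below
+   1 GiB and blockSize/4 above it, plus a few KiB of fixed padding; the sum is
+   then rounded down to a kBlockSizeAlign (64 KiB) boundary, and requests too
+   close to 4 GiB are rejected. */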
+
+
+// input is historySize
+static UInt32 MatchFinder_GetHashMask2(CMatchFinder *p, UInt32 hs)
+{
+ if (p->numHashBytes == 2)
+ return (1 << 16) - 1;
+ if (hs != 0)
+ hs--;
+ hs |= (hs >> 1);
+ hs |= (hs >> 2);
+ hs |= (hs >> 4);
+ hs |= (hs >> 8);
+ // we propagated 16 bits in (hs). Low 16 bits must be set later
+ if (hs >= (1 << 24))
+ {
+ if (p->numHashBytes == 3)
+ hs = (1 << 24) - 1;
+      /* in (bigHash) mode, GetHeads4b() in LzFindMt.c needs (hs >= ((1 << 24) - 1)) */
+ }
+ // (hash_size >= (1 << 16)) : Required for (numHashBytes > 2)
+ hs |= (1 << 16) - 1; /* don't change it! */
+  // bt5: we adjust the size to the recommended minimum size
+ if (p->numHashBytes >= 5)
+ hs |= (256 << kLzHash_CrcShift_2) - 1;
+ return hs;
+}
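+
+/* worked example for the bit-smearing above: historySize = (1 << 18) gives
+   hs = 0x3FFFF after the shifts and the low-16-bit fill, i.e. a hash table of
+   (hs + 1) = 0x40000 slots; MatchFinder_GetHashMask() below computes the same
+   way but halves the size once or twice. */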
+
+// input is historySize
+static UInt32 MatchFinder_GetHashMask(CMatchFinder *p, UInt32 hs)
+{
+ if (p->numHashBytes == 2)
+ return (1 << 16) - 1;
+ if (hs != 0)
+ hs--;
+ hs |= (hs >> 1);
+ hs |= (hs >> 2);
+ hs |= (hs >> 4);
+ hs |= (hs >> 8);
+ // we propagated 16 bits in (hs). Low 16 bits must be set later
+ hs >>= 1;
+ if (hs >= (1 << 24))
+ {
+ if (p->numHashBytes == 3)
+ hs = (1 << 24) - 1;
+ else
+ hs >>= 1;
+    /* in (bigHash) mode, GetHeads4b() in LzFindMt.c needs (hs >= ((1 << 24) - 1)) */
+ }
+ // (hash_size >= (1 << 16)) : Required for (numHashBytes > 2)
+ hs |= (1 << 16) - 1; /* don't change it! */
+  // bt5: we adjust the size to the recommended minimum size
+ if (p->numHashBytes >= 5)
+ hs |= (256 << kLzHash_CrcShift_2) - 1;
+ return hs;
+}
+
+
+int MatchFinder_Create(CMatchFinder *p, UInt32 historySize,
+ UInt32 keepAddBufferBefore, UInt32 matchMaxLen, UInt32 keepAddBufferAfter,
+ ISzAllocPtr alloc)
+{
+  /* we need one additional byte in (p->keepSizeBefore),
+     since we use MoveBlock() after (p->pos++) and before using the dictionary */
+ // keepAddBufferBefore = (UInt32)0xFFFFFFFF - (1 << 22); // for debug
+ p->keepSizeBefore = historySize + keepAddBufferBefore + 1;
+
+ keepAddBufferAfter += matchMaxLen;
+ /* we need (p->keepSizeAfter >= p->numHashBytes) */
+ if (keepAddBufferAfter < p->numHashBytes)
+ keepAddBufferAfter = p->numHashBytes;
+ // keepAddBufferAfter -= 2; // for debug
+ p->keepSizeAfter = keepAddBufferAfter;
+
+ if (p->directInput)
+ p->blockSize = 0;
+ if (p->directInput || LzInWindow_Create2(p, GetBlockSize(p, historySize), alloc))
+ {
+ size_t hashSizeSum;
+ {
+ UInt32 hs;
+ UInt32 hsCur;
+
+ if (p->numHashOutBits != 0)
+ {
+ unsigned numBits = p->numHashOutBits;
+ const unsigned nbMax =
+ (p->numHashBytes == 2 ? 16 :
+ (p->numHashBytes == 3 ? 24 : 32));
+ if (numBits > nbMax)
+ numBits = nbMax;
+ if (numBits >= 32)
+ hs = (UInt32)0 - 1;
+ else
+ hs = ((UInt32)1 << numBits) - 1;
+ // (hash_size >= (1 << 16)) : Required for (numHashBytes > 2)
+ hs |= (1 << 16) - 1; /* don't change it! */
+ if (p->numHashBytes >= 5)
+ hs |= (256 << kLzHash_CrcShift_2) - 1;
+ {
+ const UInt32 hs2 = MatchFinder_GetHashMask2(p, historySize);
+ if (hs > hs2)
+ hs = hs2;
+ }
+ hsCur = hs;
+ if (p->expectedDataSize < historySize)
+ {
+ const UInt32 hs2 = MatchFinder_GetHashMask2(p, (UInt32)p->expectedDataSize);
+ if (hsCur > hs2)
+ hsCur = hs2;
+ }
+ }
+ else
+ {
+ hs = MatchFinder_GetHashMask(p, historySize);
+ hsCur = hs;
+ if (p->expectedDataSize < historySize)
+ {
+ hsCur = MatchFinder_GetHashMask(p, (UInt32)p->expectedDataSize);
+ if (hsCur > hs) // is it possible?
+ hsCur = hs;
+ }
+ }
+
+ p->hashMask = hsCur;
+
+ hashSizeSum = hs;
+ hashSizeSum++;
+ if (hashSizeSum < hs)
+ return 0;
+ {
+ UInt32 fixedHashSize = 0;
+ if (p->numHashBytes > 2 && p->numHashBytes_Min <= 2) fixedHashSize += kHash2Size;
+ if (p->numHashBytes > 3 && p->numHashBytes_Min <= 3) fixedHashSize += kHash3Size;
+ // if (p->numHashBytes > 4) p->fixedHashSize += hs4; // kHash4Size;
+ hashSizeSum += fixedHashSize;
+ p->fixedHashSize = fixedHashSize;
+ }
+ }
+
+ p->matchMaxLen = matchMaxLen;
+
+ {
+ size_t newSize;
+ size_t numSons;
+ const UInt32 newCyclicBufferSize = historySize + 1; // do not change it
+ p->historySize = historySize;
+ p->cyclicBufferSize = newCyclicBufferSize; // it must be = (historySize + 1)
+
+ numSons = newCyclicBufferSize;
+ if (p->btMode)
+ numSons <<= 1;
+ newSize = hashSizeSum + numSons;
+
+ if (numSons < newCyclicBufferSize || newSize < numSons)
+ return 0;
+
+ // aligned size is not required here, but it can be better for some loops
+ #define NUM_REFS_ALIGN_MASK 0xF
+ newSize = (newSize + NUM_REFS_ALIGN_MASK) & ~(size_t)NUM_REFS_ALIGN_MASK;
+
+    // 22.02: we don't reallocate the buffer if the old size is enough
+ if (p->hash && p->numRefs >= newSize)
+ return 1;
+
+ MatchFinder_FreeThisClassMemory(p, alloc);
+ p->numRefs = newSize;
+ p->hash = AllocRefs(newSize, alloc);
+
+ if (p->hash)
+ {
+ p->son = p->hash + hashSizeSum;
+ return 1;
+ }
+ }
+ }
+
+ MatchFinder_Free(p, alloc);
+ return 0;
+}
+
+
+static void MatchFinder_SetLimits(CMatchFinder *p)
+{
+ UInt32 k;
+ UInt32 n = kMaxValForNormalize - p->pos;
+ if (n == 0)
+ n = (UInt32)(Int32)-1; // we allow (pos == 0) at start even with (kMaxValForNormalize == 0)
+
+ k = p->cyclicBufferSize - p->cyclicBufferPos;
+ if (k < n)
+ n = k;
+
+ k = GET_AVAIL_BYTES(p);
+ {
+ const UInt32 ksa = p->keepSizeAfter;
+ UInt32 mm = p->matchMaxLen;
+ if (k > ksa)
+ k -= ksa; // we must limit exactly to keepSizeAfter for ReadBlock
+ else if (k >= mm)
+ {
+ // the limitation for (p->lenLimit) update
+ k -= mm; // optimization : to reduce the number of checks
+ k++;
+ // k = 1; // non-optimized version : for debug
+ }
+ else
+ {
+ mm = k;
+ if (k != 0)
+ k = 1;
+ }
+ p->lenLimit = mm;
+ }
+ if (k < n)
+ n = k;
+
+ p->posLimit = p->pos + n;
+}
+
+
+void MatchFinder_Init_LowHash(CMatchFinder *p)
+{
+ size_t i;
+ CLzRef *items = p->hash;
+ const size_t numItems = p->fixedHashSize;
+ for (i = 0; i < numItems; i++)
+ items[i] = kEmptyHashValue;
+}
+
+
+void MatchFinder_Init_HighHash(CMatchFinder *p)
+{
+ size_t i;
+ CLzRef *items = p->hash + p->fixedHashSize;
+ const size_t numItems = (size_t)p->hashMask + 1;
+ for (i = 0; i < numItems; i++)
+ items[i] = kEmptyHashValue;
+}
+
+
+void MatchFinder_Init_4(CMatchFinder *p)
+{
+ if (!p->directInput)
+ p->buffer = p->bufBase;
+ {
+ /* kEmptyHashValue = 0 (Zero) is used in hash tables as NO-VALUE marker.
+ the code in CMatchFinderMt expects (pos = 1) */
+ p->pos =
+ p->streamPos =
+        1; // it's the smallest optimal value. do not change it
+ // 0; // for debug
+ }
+ p->result = SZ_OK;
+ p->streamEndWasReached = 0;
+}
+
+
+// (CYC_TO_POS_OFFSET == 0) is expected by some optimized code
+#define CYC_TO_POS_OFFSET 0
+// #define CYC_TO_POS_OFFSET 1 // for debug
+
+void MatchFinder_Init(CMatchFinder *p)
+{
+ MatchFinder_Init_HighHash(p);
+ MatchFinder_Init_LowHash(p);
+ MatchFinder_Init_4(p);
+ // if (readData)
+ MatchFinder_ReadBlock(p);
+
+ /* if we init (cyclicBufferPos = pos), then we can use one variable
+ instead of both (cyclicBufferPos) and (pos) : only before (cyclicBufferPos) wrapping */
+ p->cyclicBufferPos = (p->pos - CYC_TO_POS_OFFSET); // init with relation to (pos)
+ // p->cyclicBufferPos = 0; // smallest value
+ // p->son[0] = p->son[1] = 0; // unused: we can init skipped record for speculated accesses.
+ MatchFinder_SetLimits(p);
+}
+
+
+
+#ifdef MY_CPU_X86_OR_AMD64
+ #if defined(__clang__) && (__clang_major__ >= 4) \
+ || defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40701)
+ // || defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1900)
+
+ #define USE_LZFIND_SATUR_SUB_128
+ #define USE_LZFIND_SATUR_SUB_256
+ #define LZFIND_ATTRIB_SSE41 __attribute__((__target__("sse4.1")))
+ #define LZFIND_ATTRIB_AVX2 __attribute__((__target__("avx2")))
+ #elif defined(_MSC_VER)
+ #if (_MSC_VER >= 1600)
+ #define USE_LZFIND_SATUR_SUB_128
+ #endif
+ #if (_MSC_VER >= 1900)
+ #define USE_LZFIND_SATUR_SUB_256
+ #endif
+ #endif
+
+// #elif defined(MY_CPU_ARM_OR_ARM64)
+#elif defined(MY_CPU_ARM64)
+
+ #if defined(__clang__) && (__clang_major__ >= 8) \
+ || defined(__GNUC__) && (__GNUC__ >= 8)
+ #define USE_LZFIND_SATUR_SUB_128
+ #ifdef MY_CPU_ARM64
+ // #define LZFIND_ATTRIB_SSE41 __attribute__((__target__("")))
+ #else
+ // #define LZFIND_ATTRIB_SSE41 __attribute__((__target__("fpu=crypto-neon-fp-armv8")))
+ #endif
+
+ #elif defined(_MSC_VER)
+ #if (_MSC_VER >= 1910)
+ #define USE_LZFIND_SATUR_SUB_128
+ #endif
+ #endif
+
+ #if defined(_MSC_VER) && defined(MY_CPU_ARM64)
+ #include <arm64_neon.h>
+ #else
+ #include <arm_neon.h>
+ #endif
+
+#endif
+
+
+#ifdef USE_LZFIND_SATUR_SUB_128
+
+// #define Z7_SHOW_HW_STATUS
+
+#ifdef Z7_SHOW_HW_STATUS
+#include <stdio.h>
+#define PRF(x) x
+PRF(;)
+#else
+#define PRF(x)
+#endif
+
+
+#ifdef MY_CPU_ARM_OR_ARM64
+
+#ifdef MY_CPU_ARM64
+// #define FORCE_LZFIND_SATUR_SUB_128
+#endif
+typedef uint32x4_t LzFind_v128;
+#define SASUB_128_V(v, s) \
+ vsubq_u32(vmaxq_u32(v, s), s)
+
+#else // MY_CPU_ARM_OR_ARM64
+
+#include <smmintrin.h> // sse4.1
+
+typedef __m128i LzFind_v128;
+// SSE 4.1
+#define SASUB_128_V(v, s) \
+ _mm_sub_epi32(_mm_max_epu32(v, s), s)
+
+#endif // MY_CPU_ARM_OR_ARM64
+
+
+#define SASUB_128(i) \
+ *( LzFind_v128 *)( void *)(items + (i) * 4) = SASUB_128_V( \
+ *(const LzFind_v128 *)(const void *)(items + (i) * 4), sub2);
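+
+/* SASUB_128_V computes a per-lane saturating subtraction:
+   max(v, s) - s == (v >= s ? v - s : 0),
+   so stale references below subValue collapse to kEmptyHashValue (0),
+   matching the scalar SASUB_32 used in MatchFinder_Normalize3(). */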
+
+
+Z7_NO_INLINE
+static
+#ifdef LZFIND_ATTRIB_SSE41
+LZFIND_ATTRIB_SSE41
+#endif
+void
+Z7_FASTCALL
+LzFind_SaturSub_128(UInt32 subValue, CLzRef *items, const CLzRef *lim)
+{
+ const LzFind_v128 sub2 =
+ #ifdef MY_CPU_ARM_OR_ARM64
+ vdupq_n_u32(subValue);
+ #else
+ _mm_set_epi32((Int32)subValue, (Int32)subValue, (Int32)subValue, (Int32)subValue);
+ #endif
+ Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
+ do
+ {
+ SASUB_128(0) SASUB_128(1) items += 2 * 4;
+ SASUB_128(0) SASUB_128(1) items += 2 * 4;
+ }
+ while (items != lim);
+}
+
+
+
+#ifdef USE_LZFIND_SATUR_SUB_256
+
+#include <immintrin.h> // avx
+/*
+clang: immintrin.h uses
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
+ defined(__AVX2__)
+#include <avx2intrin.h>
+#endif
+so we need <avxintrin.h> for clang-cl */
+
+#if defined(__clang__)
+#include <avxintrin.h>
+#include <avx2intrin.h>
+#endif
+
+// AVX2:
+#define SASUB_256(i) \
+ *( __m256i *)( void *)(items + (i) * 8) = \
+ _mm256_sub_epi32(_mm256_max_epu32( \
+ *(const __m256i *)(const void *)(items + (i) * 8), sub2), sub2);
+
+Z7_NO_INLINE
+static
+#ifdef LZFIND_ATTRIB_AVX2
+LZFIND_ATTRIB_AVX2
+#endif
+void
+Z7_FASTCALL
+LzFind_SaturSub_256(UInt32 subValue, CLzRef *items, const CLzRef *lim)
+{
+ const __m256i sub2 = _mm256_set_epi32(
+ (Int32)subValue, (Int32)subValue, (Int32)subValue, (Int32)subValue,
+ (Int32)subValue, (Int32)subValue, (Int32)subValue, (Int32)subValue);
+ Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
+ do
+ {
+ SASUB_256(0) SASUB_256(1) items += 2 * 8;
+ SASUB_256(0) SASUB_256(1) items += 2 * 8;
+ }
+ while (items != lim);
+}
+#endif // USE_LZFIND_SATUR_SUB_256
+
+#ifndef FORCE_LZFIND_SATUR_SUB_128
+typedef void (Z7_FASTCALL *LZFIND_SATUR_SUB_CODE_FUNC)(
+ UInt32 subValue, CLzRef *items, const CLzRef *lim);
+static LZFIND_SATUR_SUB_CODE_FUNC g_LzFind_SaturSub;
+#endif // FORCE_LZFIND_SATUR_SUB_128
+
+#endif // USE_LZFIND_SATUR_SUB_128
+
+
+// kEmptyHashValue must be zero
+// #define SASUB_32(i) { UInt32 v = items[i]; UInt32 m = v - subValue; if (v < subValue) m = kEmptyHashValue; items[i] = m; }
+#define SASUB_32(i) { UInt32 v = items[i]; if (v < subValue) v = subValue; items[i] = v - subValue; }
+
+#ifdef FORCE_LZFIND_SATUR_SUB_128
+
+#define DEFAULT_SaturSub LzFind_SaturSub_128
+
+#else
+
+#define DEFAULT_SaturSub LzFind_SaturSub_32
+
+Z7_NO_INLINE
+static
+void
+Z7_FASTCALL
+LzFind_SaturSub_32(UInt32 subValue, CLzRef *items, const CLzRef *lim)
+{
+ Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
+ do
+ {
+ SASUB_32(0) SASUB_32(1) items += 2;
+ SASUB_32(0) SASUB_32(1) items += 2;
+ SASUB_32(0) SASUB_32(1) items += 2;
+ SASUB_32(0) SASUB_32(1) items += 2;
+ }
+ while (items != lim);
+}
+
+#endif
+
+
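+/* MatchFinder_Normalize3() below runs in three phases: a scalar head loop
+   until (items) reaches a 128-byte boundary, a vector (or scalar) bulk loop
+   over whole aligned groups, and a scalar tail loop for the remainder. */
+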
+Z7_NO_INLINE
+void MatchFinder_Normalize3(UInt32 subValue, CLzRef *items, size_t numItems)
+{
+ #define LZFIND_NORM_ALIGN_BLOCK_SIZE (1 << 7)
+ Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
+ for (; numItems != 0 && ((unsigned)(ptrdiff_t)items & (LZFIND_NORM_ALIGN_BLOCK_SIZE - 1)) != 0; numItems--)
+ {
+ SASUB_32(0)
+ items++;
+ }
+ {
+ const size_t k_Align_Mask = (LZFIND_NORM_ALIGN_BLOCK_SIZE / 4 - 1);
+ CLzRef *lim = items + (numItems & ~(size_t)k_Align_Mask);
+ numItems &= k_Align_Mask;
+ if (items != lim)
+ {
+ #if defined(USE_LZFIND_SATUR_SUB_128) && !defined(FORCE_LZFIND_SATUR_SUB_128)
+ if (g_LzFind_SaturSub)
+ g_LzFind_SaturSub(subValue, items, lim);
+ else
+ #endif
+ DEFAULT_SaturSub(subValue, items, lim);
+ }
+ items = lim;
+ }
+ Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
+ for (; numItems != 0; numItems--)
+ {
+ SASUB_32(0)
+ items++;
+ }
+}
+
+
+
+// call MatchFinder_CheckLimits() only after (p->pos++) update
+
+Z7_NO_INLINE
+static void MatchFinder_CheckLimits(CMatchFinder *p)
+{
+ if (// !p->streamEndWasReached && p->result == SZ_OK &&
+ p->keepSizeAfter == GET_AVAIL_BYTES(p))
+ {
+ // we try to read only in exact state (p->keepSizeAfter == GET_AVAIL_BYTES(p))
+ if (MatchFinder_NeedMove(p))
+ MatchFinder_MoveBlock(p);
+ MatchFinder_ReadBlock(p);
+ }
+
+ if (p->pos == kMaxValForNormalize)
+ if (GET_AVAIL_BYTES(p) >= p->numHashBytes) // optional optimization for last bytes of data.
+      /*
+      if we disable normalization for the last bytes of data, and
+      if (data_size == 4 GiB), we don't call wasteful normalization,
+      but (pos) will wrap over zero (0) in that case,
+      and we cannot resume normal operation later.
+      */
+ {
+ // MatchFinder_Normalize(p);
+ /* after normalization we need (p->pos >= p->historySize + 1); */
+    /* we can reduce subValue to an aligned value, if we want to keep the alignment
+       of (p->pos) and (p->buffer) for speculated accesses. */
+ const UInt32 subValue = (p->pos - p->historySize - 1) /* & ~(UInt32)(kNormalizeAlign - 1) */;
+ // const UInt32 subValue = (1 << 15); // for debug
+ // printf("\nMatchFinder_Normalize() subValue == 0x%x\n", subValue);
+ MatchFinder_REDUCE_OFFSETS(p, subValue)
+ MatchFinder_Normalize3(subValue, p->hash, (size_t)p->hashMask + 1 + p->fixedHashSize);
+ {
+ size_t numSonRefs = p->cyclicBufferSize;
+ if (p->btMode)
+ numSonRefs <<= 1;
+ MatchFinder_Normalize3(subValue, p->son, numSonRefs);
+ }
+ }
+
+ if (p->cyclicBufferPos == p->cyclicBufferSize)
+ p->cyclicBufferPos = 0;
+
+ MatchFinder_SetLimits(p);
+}
+
+
+/*
+ (lenLimit > maxLen)
+*/
+Z7_FORCE_INLINE
+static UInt32 * Hc_GetMatchesSpec(size_t lenLimit, UInt32 curMatch, UInt32 pos, const Byte *cur, CLzRef *son,
+ size_t _cyclicBufferPos, UInt32 _cyclicBufferSize, UInt32 cutValue,
+ UInt32 *d, unsigned maxLen)
+{
+ /*
+ son[_cyclicBufferPos] = curMatch;
+ for (;;)
+ {
+ UInt32 delta = pos - curMatch;
+ if (cutValue-- == 0 || delta >= _cyclicBufferSize)
+ return d;
+ {
+ const Byte *pb = cur - delta;
+ curMatch = son[_cyclicBufferPos - delta + ((delta > _cyclicBufferPos) ? _cyclicBufferSize : 0)];
+ if (pb[maxLen] == cur[maxLen] && *pb == *cur)
+ {
+ UInt32 len = 0;
+ while (++len != lenLimit)
+ if (pb[len] != cur[len])
+ break;
+ if (maxLen < len)
+ {
+ maxLen = len;
+ *d++ = len;
+ *d++ = delta - 1;
+ if (len == lenLimit)
+ return d;
+ }
+ }
+ }
+ }
+ */
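+
+  /* matches are appended to (d) as (length, distance - 1) pairs with strictly
+     increasing lengths; the hash chain is followed through son[] until
+     cutValue links are used, the chain ends, or a full lenLimit match is found. */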
+
+ const Byte *lim = cur + lenLimit;
+ son[_cyclicBufferPos] = curMatch;
+
+ do
+ {
+ UInt32 delta;
+
+ if (curMatch == 0)
+ break;
+ // if (curMatch2 >= curMatch) return NULL;
+ delta = pos - curMatch;
+ if (delta >= _cyclicBufferSize)
+ break;
+ {
+ ptrdiff_t diff;
+ curMatch = son[_cyclicBufferPos - delta + ((delta > _cyclicBufferPos) ? _cyclicBufferSize : 0)];
+ diff = (ptrdiff_t)0 - (ptrdiff_t)delta;
+ if (cur[maxLen] == cur[(ptrdiff_t)maxLen + diff])
+ {
+ const Byte *c = cur;
+ while (*c == c[diff])
+ {
+ if (++c == lim)
+ {
+ d[0] = (UInt32)(lim - cur);
+ d[1] = delta - 1;
+ return d + 2;
+ }
+ }
+ {
+ const unsigned len = (unsigned)(c - cur);
+ if (maxLen < len)
+ {
+ maxLen = len;
+ d[0] = (UInt32)len;
+ d[1] = delta - 1;
+ d += 2;
+ }
+ }
+ }
+ }
+ }
+ while (--cutValue);
+
+ return d;
+}
+
+
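+/* GetMatchesSpec1() is the binary-tree variant: son[] keeps two links per
+   cyclic position (a left and a right subtree of suffixes ordered
+   lexicographically), and each search re-links the visited nodes so that the
+   current position becomes the new root of the tree. */
+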
+Z7_FORCE_INLINE
+UInt32 * GetMatchesSpec1(UInt32 lenLimit, UInt32 curMatch, UInt32 pos, const Byte *cur, CLzRef *son,
+ size_t _cyclicBufferPos, UInt32 _cyclicBufferSize, UInt32 cutValue,
+ UInt32 *d, UInt32 maxLen)
+{
+ CLzRef *ptr0 = son + ((size_t)_cyclicBufferPos << 1) + 1;
+ CLzRef *ptr1 = son + ((size_t)_cyclicBufferPos << 1);
+ unsigned len0 = 0, len1 = 0;
+
+ UInt32 cmCheck;
+
+ // if (curMatch >= pos) { *ptr0 = *ptr1 = kEmptyHashValue; return NULL; }
+
+ cmCheck = (UInt32)(pos - _cyclicBufferSize);
+ if ((UInt32)pos <= _cyclicBufferSize)
+ cmCheck = 0;
+
+ if (cmCheck < curMatch)
+ do
+ {
+ const UInt32 delta = pos - curMatch;
+ {
+ CLzRef *pair = son + ((size_t)(_cyclicBufferPos - delta + ((delta > _cyclicBufferPos) ? _cyclicBufferSize : 0)) << 1);
+ const Byte *pb = cur - delta;
+ unsigned len = (len0 < len1 ? len0 : len1);
+ const UInt32 pair0 = pair[0];
+ if (pb[len] == cur[len])
+ {
+ if (++len != lenLimit && pb[len] == cur[len])
+ while (++len != lenLimit)
+ if (pb[len] != cur[len])
+ break;
+ if (maxLen < len)
+ {
+ maxLen = (UInt32)len;
+ *d++ = (UInt32)len;
+ *d++ = delta - 1;
+ if (len == lenLimit)
+ {
+ *ptr1 = pair0;
+ *ptr0 = pair[1];
+ return d;
+ }
+ }
+ }
+ if (pb[len] < cur[len])
+ {
+ *ptr1 = curMatch;
+ // const UInt32 curMatch2 = pair[1];
+ // if (curMatch2 >= curMatch) { *ptr0 = *ptr1 = kEmptyHashValue; return NULL; }
+ // curMatch = curMatch2;
+ curMatch = pair[1];
+ ptr1 = pair + 1;
+ len1 = len;
+ }
+ else
+ {
+ *ptr0 = curMatch;
+ curMatch = pair[0];
+ ptr0 = pair;
+ len0 = len;
+ }
+ }
+ }
+  while (--cutValue && cmCheck < curMatch);
+
+ *ptr0 = *ptr1 = kEmptyHashValue;
+ return d;
+}
+
+
+static void SkipMatchesSpec(UInt32 lenLimit, UInt32 curMatch, UInt32 pos, const Byte *cur, CLzRef *son,
+ size_t _cyclicBufferPos, UInt32 _cyclicBufferSize, UInt32 cutValue)
+{
+ CLzRef *ptr0 = son + ((size_t)_cyclicBufferPos << 1) + 1;
+ CLzRef *ptr1 = son + ((size_t)_cyclicBufferPos << 1);
+ unsigned len0 = 0, len1 = 0;
+
+ UInt32 cmCheck;
+
+ cmCheck = (UInt32)(pos - _cyclicBufferSize);
+ if ((UInt32)pos <= _cyclicBufferSize)
+ cmCheck = 0;
+
+ if (// curMatch >= pos || // failure
+ cmCheck < curMatch)
+ do
+ {
+ const UInt32 delta = pos - curMatch;
+ {
+ CLzRef *pair = son + ((size_t)(_cyclicBufferPos - delta + ((delta > _cyclicBufferPos) ? _cyclicBufferSize : 0)) << 1);
+ const Byte *pb = cur - delta;
+ unsigned len = (len0 < len1 ? len0 : len1);
+ if (pb[len] == cur[len])
+ {
+ while (++len != lenLimit)
+ if (pb[len] != cur[len])
+ break;
+ {
+ if (len == lenLimit)
+ {
+ *ptr1 = pair[0];
+ *ptr0 = pair[1];
+ return;
+ }
+ }
+ }
+ if (pb[len] < cur[len])
+ {
+ *ptr1 = curMatch;
+ curMatch = pair[1];
+ ptr1 = pair + 1;
+ len1 = len;
+ }
+ else
+ {
+ *ptr0 = curMatch;
+ curMatch = pair[0];
+ ptr0 = pair;
+ len0 = len;
+ }
+ }
+ }
+  while (--cutValue && cmCheck < curMatch);
+
+ *ptr0 = *ptr1 = kEmptyHashValue;
+ return;
+}
+
+
+#define MOVE_POS \
+ ++p->cyclicBufferPos; \
+ p->buffer++; \
+ { const UInt32 pos1 = p->pos + 1; p->pos = pos1; if (pos1 == p->posLimit) MatchFinder_CheckLimits(p); }
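+
+/* MOVE_POS advances the window by one byte; when (pos) reaches (posLimit),
+   MatchFinder_CheckLimits() refills or moves the buffer, normalizes the
+   32-bit positions if needed, and recomputes the limits. */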
+
+#define MOVE_POS_RET MOVE_POS return distances;
+
+Z7_NO_INLINE
+static void MatchFinder_MovePos(CMatchFinder *p)
+{
+  /* we go here at the end of stream data, when (avail < num_hash_bytes).
+     We don't update sons[cyclicBufferPos << btMode].
+     So the (sons) record will contain junk, and we cannot resume match searching
+     in normal operation, even if more input data is provided in the buffer later.
+     p->sons[p->cyclicBufferPos << p->btMode] = 0; // kEmptyHashValue
+     if (p->btMode)
+        p->sons[(p->cyclicBufferPos << p->btMode) + 1] = 0; // kEmptyHashValue
+  */
+ MOVE_POS
+}
+
+#define GET_MATCHES_HEADER2(minLen, ret_op) \
+ unsigned lenLimit; UInt32 hv; const Byte *cur; UInt32 curMatch; \
+ lenLimit = (unsigned)p->lenLimit; { if (lenLimit < minLen) { MatchFinder_MovePos(p); ret_op; }} \
+ cur = p->buffer;
+
+#define GET_MATCHES_HEADER(minLen) GET_MATCHES_HEADER2(minLen, return distances)
+#define SKIP_HEADER(minLen) do { GET_MATCHES_HEADER2(minLen, continue)
+
+#define MF_PARAMS(p) lenLimit, curMatch, p->pos, p->buffer, p->son, p->cyclicBufferPos, p->cyclicBufferSize, p->cutValue
+
+#define SKIP_FOOTER SkipMatchesSpec(MF_PARAMS(p)); MOVE_POS } while (--num);
+
+#define GET_MATCHES_FOOTER_BASE(_maxLen_, func) \
+ distances = func(MF_PARAMS(p), \
+ distances, (UInt32)_maxLen_); MOVE_POS_RET
+
+#define GET_MATCHES_FOOTER_BT(_maxLen_) \
+ GET_MATCHES_FOOTER_BASE(_maxLen_, GetMatchesSpec1)
+
+#define GET_MATCHES_FOOTER_HC(_maxLen_) \
+ GET_MATCHES_FOOTER_BASE(_maxLen_, Hc_GetMatchesSpec)
+
+
+
+#define UPDATE_maxLen { \
+ const ptrdiff_t diff = (ptrdiff_t)0 - (ptrdiff_t)d2; \
+ const Byte *c = cur + maxLen; \
+ const Byte *lim = cur + lenLimit; \
+ for (; c != lim; c++) if (*(c + diff) != *c) break; \
+ maxLen = (unsigned)(c - cur); }
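+
+/* UPDATE_maxLen extends the already-verified prefix of the (d2) candidate by
+   plain byte comparison, up to lenLimit; (diff) is the negative distance, so
+   *(c + diff) reads the matching byte in the dictionary. */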
+
+static UInt32* Bt2_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances)
+{
+ GET_MATCHES_HEADER(2)
+ HASH2_CALC
+ curMatch = p->hash[hv];
+ p->hash[hv] = p->pos;
+ GET_MATCHES_FOOTER_BT(1)
+}
+
+UInt32* Bt3Zip_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances)
+{
+ GET_MATCHES_HEADER(3)
+ HASH_ZIP_CALC
+ curMatch = p->hash[hv];
+ p->hash[hv] = p->pos;
+ GET_MATCHES_FOOTER_BT(2)
+}
+
+
+#define SET_mmm \
+ mmm = p->cyclicBufferSize; \
+ if (pos < mmm) \
+ mmm = pos;
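+
+/* mmm limits match distances to min(cyclicBufferSize, pos): a delta may not
+   reach outside the cyclic buffer nor before the start of the data. */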
+
+
+static UInt32* Bt3_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances)
+{
+ UInt32 mmm;
+ UInt32 h2, d2, pos;
+ unsigned maxLen;
+ UInt32 *hash;
+ GET_MATCHES_HEADER(3)
+
+ HASH3_CALC
+
+ hash = p->hash;
+ pos = p->pos;
+
+ d2 = pos - hash[h2];
+
+ curMatch = (hash + kFix3HashSize)[hv];
+
+ hash[h2] = pos;
+ (hash + kFix3HashSize)[hv] = pos;
+
+ SET_mmm
+
+ maxLen = 2;
+
+ if (d2 < mmm && *(cur - d2) == *cur)
+ {
+ UPDATE_maxLen
+ distances[0] = (UInt32)maxLen;
+ distances[1] = d2 - 1;
+ distances += 2;
+ if (maxLen == lenLimit)
+ {
+ SkipMatchesSpec(MF_PARAMS(p));
+ MOVE_POS_RET
+ }
+ }
+
+ GET_MATCHES_FOOTER_BT(maxLen)
+}
+
+
+static UInt32* Bt4_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances)
+{
+ UInt32 mmm;
+ UInt32 h2, h3, d2, d3, pos;
+ unsigned maxLen;
+ UInt32 *hash;
+ GET_MATCHES_HEADER(4)
+
+ HASH4_CALC
+
+ hash = p->hash;
+ pos = p->pos;
+
+ d2 = pos - hash [h2];
+ d3 = pos - (hash + kFix3HashSize)[h3];
+ curMatch = (hash + kFix4HashSize)[hv];
+
+ hash [h2] = pos;
+ (hash + kFix3HashSize)[h3] = pos;
+ (hash + kFix4HashSize)[hv] = pos;
+
+ SET_mmm
+
+ maxLen = 3;
+
+ for (;;)
+ {
+ if (d2 < mmm && *(cur - d2) == *cur)
+ {
+ distances[0] = 2;
+ distances[1] = d2 - 1;
+ distances += 2;
+ if (*(cur - d2 + 2) == cur[2])
+ {
+ // distances[-2] = 3;
+ }
+ else if (d3 < mmm && *(cur - d3) == *cur)
+ {
+ d2 = d3;
+ distances[1] = d3 - 1;
+ distances += 2;
+ }
+ else
+ break;
+ }
+ else if (d3 < mmm && *(cur - d3) == *cur)
+ {
+ d2 = d3;
+ distances[1] = d3 - 1;
+ distances += 2;
+ }
+ else
+ break;
+
+ UPDATE_maxLen
+ distances[-2] = (UInt32)maxLen;
+ if (maxLen == lenLimit)
+ {
+ SkipMatchesSpec(MF_PARAMS(p));
+ MOVE_POS_RET
+ }
+ break;
+ }
+
+ GET_MATCHES_FOOTER_BT(maxLen)
+}
+
+
+static UInt32* Bt5_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances)
+{
+ UInt32 mmm;
+ UInt32 h2, h3, d2, d3, maxLen, pos;
+ UInt32 *hash;
+ GET_MATCHES_HEADER(5)
+
+ HASH5_CALC
+
+ hash = p->hash;
+ pos = p->pos;
+
+ d2 = pos - hash [h2];
+ d3 = pos - (hash + kFix3HashSize)[h3];
+ // d4 = pos - (hash + kFix4HashSize)[h4];
+
+ curMatch = (hash + kFix5HashSize)[hv];
+
+ hash [h2] = pos;
+ (hash + kFix3HashSize)[h3] = pos;
+ // (hash + kFix4HashSize)[h4] = pos;
+ (hash + kFix5HashSize)[hv] = pos;
+
+ SET_mmm
+
+ maxLen = 4;
+
+ for (;;)
+ {
+ if (d2 < mmm && *(cur - d2) == *cur)
+ {
+ distances[0] = 2;
+ distances[1] = d2 - 1;
+ distances += 2;
+ if (*(cur - d2 + 2) == cur[2])
+ {
+ }
+ else if (d3 < mmm && *(cur - d3) == *cur)
+ {
+ distances[1] = d3 - 1;
+ distances += 2;
+ d2 = d3;
+ }
+ else
+ break;
+ }
+ else if (d3 < mmm && *(cur - d3) == *cur)
+ {
+ distances[1] = d3 - 1;
+ distances += 2;
+ d2 = d3;
+ }
+ else
+ break;
+
+ distances[-2] = 3;
+ if (*(cur - d2 + 3) != cur[3])
+ break;
+ UPDATE_maxLen
+ distances[-2] = (UInt32)maxLen;
+ if (maxLen == lenLimit)
+ {
+ SkipMatchesSpec(MF_PARAMS(p));
+ MOVE_POS_RET
+ }
+ break;
+ }
+
+ GET_MATCHES_FOOTER_BT(maxLen)
+}
+
+
+static UInt32* Hc4_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances)
+{
+ UInt32 mmm;
+ UInt32 h2, h3, d2, d3, pos;
+ unsigned maxLen;
+ UInt32 *hash;
+ GET_MATCHES_HEADER(4)
+
+ HASH4_CALC
+
+ hash = p->hash;
+ pos = p->pos;
+
+ d2 = pos - hash [h2];
+ d3 = pos - (hash + kFix3HashSize)[h3];
+ curMatch = (hash + kFix4HashSize)[hv];
+
+ hash [h2] = pos;
+ (hash + kFix3HashSize)[h3] = pos;
+ (hash + kFix4HashSize)[hv] = pos;
+
+ SET_mmm
+
+ maxLen = 3;
+
+ for (;;)
+ {
+ if (d2 < mmm && *(cur - d2) == *cur)
+ {
+ distances[0] = 2;
+ distances[1] = d2 - 1;
+ distances += 2;
+ if (*(cur - d2 + 2) == cur[2])
+ {
+ // distances[-2] = 3;
+ }
+ else if (d3 < mmm && *(cur - d3) == *cur)
+ {
+ d2 = d3;
+ distances[1] = d3 - 1;
+ distances += 2;
+ }
+ else
+ break;
+ }
+ else if (d3 < mmm && *(cur - d3) == *cur)
+ {
+ d2 = d3;
+ distances[1] = d3 - 1;
+ distances += 2;
+ }
+ else
+ break;
+
+ UPDATE_maxLen
+ distances[-2] = (UInt32)maxLen;
+ if (maxLen == lenLimit)
+ {
+ p->son[p->cyclicBufferPos] = curMatch;
+ MOVE_POS_RET
+ }
+ break;
+ }
+
+ GET_MATCHES_FOOTER_HC(maxLen)
+}
+
+
+static UInt32 * Hc5_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances)
+{
+ UInt32 mmm;
+ UInt32 h2, h3, d2, d3, maxLen, pos;
+ UInt32 *hash;
+ GET_MATCHES_HEADER(5)
+
+ HASH5_CALC
+
+ hash = p->hash;
+ pos = p->pos;
+
+ d2 = pos - hash [h2];
+ d3 = pos - (hash + kFix3HashSize)[h3];
+ // d4 = pos - (hash + kFix4HashSize)[h4];
+
+ curMatch = (hash + kFix5HashSize)[hv];
+
+ hash [h2] = pos;
+ (hash + kFix3HashSize)[h3] = pos;
+ // (hash + kFix4HashSize)[h4] = pos;
+ (hash + kFix5HashSize)[hv] = pos;
+
+ SET_mmm
+
+ maxLen = 4;
+
+ for (;;)
+ {
+ if (d2 < mmm && *(cur - d2) == *cur)
+ {
+ distances[0] = 2;
+ distances[1] = d2 - 1;
+ distances += 2;
+ if (*(cur - d2 + 2) == cur[2])
+ {
+ }
+ else if (d3 < mmm && *(cur - d3) == *cur)
+ {
+ distances[1] = d3 - 1;
+ distances += 2;
+ d2 = d3;
+ }
+ else
+ break;
+ }
+ else if (d3 < mmm && *(cur - d3) == *cur)
+ {
+ distances[1] = d3 - 1;
+ distances += 2;
+ d2 = d3;
+ }
+ else
+ break;
+
+ distances[-2] = 3;
+ if (*(cur - d2 + 3) != cur[3])
+ break;
+ UPDATE_maxLen
+ distances[-2] = maxLen;
+ if (maxLen == lenLimit)
+ {
+ p->son[p->cyclicBufferPos] = curMatch;
+ MOVE_POS_RET
+ }
+ break;
+ }
+
+ GET_MATCHES_FOOTER_HC(maxLen)
+}
+
+
+UInt32* Hc3Zip_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances)
+{
+ GET_MATCHES_HEADER(3)
+ HASH_ZIP_CALC
+ curMatch = p->hash[hv];
+ p->hash[hv] = p->pos;
+ GET_MATCHES_FOOTER_HC(2)
+}
+
+
+static void Bt2_MatchFinder_Skip(CMatchFinder *p, UInt32 num)
+{
+ SKIP_HEADER(2)
+ {
+ HASH2_CALC
+ curMatch = p->hash[hv];
+ p->hash[hv] = p->pos;
+ }
+ SKIP_FOOTER
+}
+
+void Bt3Zip_MatchFinder_Skip(CMatchFinder *p, UInt32 num)
+{
+ SKIP_HEADER(3)
+ {
+ HASH_ZIP_CALC
+ curMatch = p->hash[hv];
+ p->hash[hv] = p->pos;
+ }
+ SKIP_FOOTER
+}
+
+static void Bt3_MatchFinder_Skip(CMatchFinder *p, UInt32 num)
+{
+ SKIP_HEADER(3)
+ {
+ UInt32 h2;
+ UInt32 *hash;
+ HASH3_CALC
+ hash = p->hash;
+ curMatch = (hash + kFix3HashSize)[hv];
+ hash[h2] =
+ (hash + kFix3HashSize)[hv] = p->pos;
+ }
+ SKIP_FOOTER
+}
+
+static void Bt4_MatchFinder_Skip(CMatchFinder *p, UInt32 num)
+{
+ SKIP_HEADER(4)
+ {
+ UInt32 h2, h3;
+ UInt32 *hash;
+ HASH4_CALC
+ hash = p->hash;
+ curMatch = (hash + kFix4HashSize)[hv];
+ hash [h2] =
+ (hash + kFix3HashSize)[h3] =
+ (hash + kFix4HashSize)[hv] = p->pos;
+ }
+ SKIP_FOOTER
+}
+
+static void Bt5_MatchFinder_Skip(CMatchFinder *p, UInt32 num)
+{
+ SKIP_HEADER(5)
+ {
+ UInt32 h2, h3;
+ UInt32 *hash;
+ HASH5_CALC
+ hash = p->hash;
+ curMatch = (hash + kFix5HashSize)[hv];
+ hash [h2] =
+ (hash + kFix3HashSize)[h3] =
+ // (hash + kFix4HashSize)[h4] =
+ (hash + kFix5HashSize)[hv] = p->pos;
+ }
+ SKIP_FOOTER
+}
+
+
+#define HC_SKIP_HEADER(minLen) \
+ do { if (p->lenLimit < minLen) { MatchFinder_MovePos(p); num--; continue; } { \
+ const Byte *cur; \
+ UInt32 *hash; \
+ UInt32 *son; \
+ UInt32 pos = p->pos; \
+ UInt32 num2 = num; \
+ /* (p->pos == p->posLimit) is not allowed here !!! */ \
+ { const UInt32 rem = p->posLimit - pos; if (num2 > rem) num2 = rem; } \
+ num -= num2; \
+ { const UInt32 cycPos = p->cyclicBufferPos; \
+ son = p->son + cycPos; \
+ p->cyclicBufferPos = cycPos + num2; } \
+ cur = p->buffer; \
+ hash = p->hash; \
+ do { \
+ UInt32 curMatch; \
+ UInt32 hv;
+
+
+#define HC_SKIP_FOOTER \
+ cur++; pos++; *son++ = curMatch; \
+ } while (--num2); \
+ p->buffer = cur; \
+ p->pos = pos; \
+ if (pos == p->posLimit) MatchFinder_CheckLimits(p); \
+ }} while(num); \
+
+
+static void Hc4_MatchFinder_Skip(CMatchFinder *p, UInt32 num)
+{
+ HC_SKIP_HEADER(4)
+
+ UInt32 h2, h3;
+ HASH4_CALC
+ curMatch = (hash + kFix4HashSize)[hv];
+ hash [h2] =
+ (hash + kFix3HashSize)[h3] =
+ (hash + kFix4HashSize)[hv] = pos;
+
+ HC_SKIP_FOOTER
+}
+
+
+static void Hc5_MatchFinder_Skip(CMatchFinder *p, UInt32 num)
+{
+ HC_SKIP_HEADER(5)
+
+ UInt32 h2, h3;
+ HASH5_CALC
+ curMatch = (hash + kFix5HashSize)[hv];
+ hash [h2] =
+ (hash + kFix3HashSize)[h3] =
+ // (hash + kFix4HashSize)[h4] =
+ (hash + kFix5HashSize)[hv] = pos;
+
+ HC_SKIP_FOOTER
+}
+
+
+void Hc3Zip_MatchFinder_Skip(CMatchFinder *p, UInt32 num)
+{
+ HC_SKIP_HEADER(3)
+
+ HASH_ZIP_CALC
+ curMatch = hash[hv];
+ hash[hv] = pos;
+
+ HC_SKIP_FOOTER
+}
+
+
+void MatchFinder_CreateVTable(CMatchFinder *p, IMatchFinder2 *vTable)
+{
+ vTable->Init = (Mf_Init_Func)MatchFinder_Init;
+ vTable->GetNumAvailableBytes = (Mf_GetNumAvailableBytes_Func)MatchFinder_GetNumAvailableBytes;
+ vTable->GetPointerToCurrentPos = (Mf_GetPointerToCurrentPos_Func)MatchFinder_GetPointerToCurrentPos;
+ if (!p->btMode)
+ {
+ if (p->numHashBytes <= 4)
+ {
+ vTable->GetMatches = (Mf_GetMatches_Func)Hc4_MatchFinder_GetMatches;
+ vTable->Skip = (Mf_Skip_Func)Hc4_MatchFinder_Skip;
+ }
+ else
+ {
+ vTable->GetMatches = (Mf_GetMatches_Func)Hc5_MatchFinder_GetMatches;
+ vTable->Skip = (Mf_Skip_Func)Hc5_MatchFinder_Skip;
+ }
+ }
+ else if (p->numHashBytes == 2)
+ {
+ vTable->GetMatches = (Mf_GetMatches_Func)Bt2_MatchFinder_GetMatches;
+ vTable->Skip = (Mf_Skip_Func)Bt2_MatchFinder_Skip;
+ }
+ else if (p->numHashBytes == 3)
+ {
+ vTable->GetMatches = (Mf_GetMatches_Func)Bt3_MatchFinder_GetMatches;
+ vTable->Skip = (Mf_Skip_Func)Bt3_MatchFinder_Skip;
+ }
+ else if (p->numHashBytes == 4)
+ {
+ vTable->GetMatches = (Mf_GetMatches_Func)Bt4_MatchFinder_GetMatches;
+ vTable->Skip = (Mf_Skip_Func)Bt4_MatchFinder_Skip;
+ }
+ else
+ {
+ vTable->GetMatches = (Mf_GetMatches_Func)Bt5_MatchFinder_GetMatches;
+ vTable->Skip = (Mf_Skip_Func)Bt5_MatchFinder_Skip;
+ }
+}
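+
+/* dispatch summary: hash-chain finders (hc4, hc5) when btMode == 0, selected
+   by numHashBytes <= 4; binary-tree finders (bt2..bt5) otherwise, selected by
+   the exact numHashBytes value, with bt5 as the default for larger values. */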
+
+
+
+void LzFindPrepare(void)
+{
+ #ifndef FORCE_LZFIND_SATUR_SUB_128
+ #ifdef USE_LZFIND_SATUR_SUB_128
+ LZFIND_SATUR_SUB_CODE_FUNC f = NULL;
+ #ifdef MY_CPU_ARM_OR_ARM64
+ {
+ if (CPU_IsSupported_NEON())
+ {
+ // #pragma message ("=== LzFind NEON")
+ PRF(printf("\n=== LzFind NEON\n"));
+ f = LzFind_SaturSub_128;
+ }
+ // f = 0; // for debug
+ }
+ #else // MY_CPU_ARM_OR_ARM64
+ if (CPU_IsSupported_SSE41())
+ {
+ // #pragma message ("=== LzFind SSE41")
+ PRF(printf("\n=== LzFind SSE41\n"));
+ f = LzFind_SaturSub_128;
+
+ #ifdef USE_LZFIND_SATUR_SUB_256
+ if (CPU_IsSupported_AVX2())
+ {
+ // #pragma message ("=== LzFind AVX2")
+ PRF(printf("\n=== LzFind AVX2\n"));
+ f = LzFind_SaturSub_256;
+ }
+ #endif
+ }
+ #endif // MY_CPU_ARM_OR_ARM64
+ g_LzFind_SaturSub = f;
+ #endif // USE_LZFIND_SATUR_SUB_128
+ #endif // FORCE_LZFIND_SATUR_SUB_128
+}
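+
+/* LzFindPrepare() performs the one-time runtime dispatch for normalization:
+   it probes CPU features and selects the AVX2, SSE4.1 or NEON saturating
+   subtraction, leaving g_LzFind_SaturSub == NULL to mean the scalar fallback. */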
+
+
+#undef MOVE_POS
+#undef MOVE_POS_RET
+#undef PRF