From cf48b532b447faa969347fef183c6e8921c4ded2 Mon Sep 17 00:00:00 2001
From: Mounir IDRASSI <mounir.idrassi@idrix.fr>
Date: Tue, 26 Feb 2019 01:50:27 +0100
Subject: Windows: Implement RAM encryption for keys on 64-bit machines using
 ChaCha12 cipher and t1ha non-cryptographic fast hash
 (https://github.com/leo-yuriev/t1ha)

---
 src/Crypto/t1ha_bits.h | 904 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 904 insertions(+)
 create mode 100644 src/Crypto/t1ha_bits.h

(limited to 'src/Crypto/t1ha_bits.h')

diff --git a/src/Crypto/t1ha_bits.h b/src/Crypto/t1ha_bits.h
new file mode 100644
index 00000000..b78c4129
--- /dev/null
+++ b/src/Crypto/t1ha_bits.h
@@ -0,0 +1,904 @@
+/*
+ * Copyright (c) 2016-2018 Positive Technologies, https://www.ptsecurity.com,
+ * Fast Positive Hash.
+ *
+ * Portions Copyright (c) 2010-2018 Leonid Yuriev <leo@yuriev.ru>,
+ * The 1Hippeus project (t1h).
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgement in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/*
+ * t1ha = { Fast Positive Hash, aka "Позитивный Хэш" }
+ * by [Positive Technologies](https://www.ptsecurity.ru)
+ *
+ * Briefly, it is a 64-bit hash function:
+ * 1. Created for 64-bit little-endian platforms, predominantly for x86_64,
+ *    but portable and able to run without penalty on any 64-bit CPU.
+ * 2. In most cases up to 15% faster than City64, xxHash, mum-hash, metro-hash
+ *    and all other portable hash functions (which do not use specific
+ *    hardware tricks).
+ * 3. Not suitable for cryptography.
+ *
+ * The Future will be Positive. Всё будет хорошо. (Everything will be fine.)
+ *
+ * ACKNOWLEDGEMENT:
+ * The t1ha was originally developed by Leonid Yuriev (Леонид Юрьев)
+ * for The 1Hippeus project - zerocopy messaging in the spirit of Sparta!
+ */
+
+#pragma once
+
+#if defined(_MSC_VER)
+#pragma warning(disable : 4201) /* nameless struct/union */
+#if _MSC_VER > 1800
+#pragma warning(disable : 4464) /* relative include path contains '..' */
+#endif /* 1800 */
+#endif /* MSVC */
+#include "t1ha.h"
+
+#ifndef T1HA_USE_FAST_ONESHOT_READ
+/* Define it to 1 for slightly faster code.
+ * Unfortunately this may trigger false-positive alarms from Valgrind,
+ * AddressSanitizer and other similar tools.
+ * So, define it to 0 for peace of mind if in doubt. */
+#define T1HA_USE_FAST_ONESHOT_READ 1
+#endif /* T1HA_USE_FAST_ONESHOT_READ */
+
+/*****************************************************************************/
+
+#include <assert.h> /* for assert() */
+#include <string.h> /* for memcpy() */
+
+#if __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__ && \
+    __BYTE_ORDER__ != __ORDER_BIG_ENDIAN__
+#error Unsupported byte order.
+#endif
+
+#define T1HA_UNALIGNED_ACCESS__UNABLE 0
+#define T1HA_UNALIGNED_ACCESS__SLOW 1
+#define T1HA_UNALIGNED_ACCESS__EFFICIENT 2
+
+#ifndef T1HA_SYS_UNALIGNED_ACCESS
+#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
+#define T1HA_SYS_UNALIGNED_ACCESS T1HA_UNALIGNED_ACCESS__EFFICIENT
+#elif defined(__ia32__)
+#define T1HA_SYS_UNALIGNED_ACCESS T1HA_UNALIGNED_ACCESS__EFFICIENT
+#elif defined(__e2k__)
+#define T1HA_SYS_UNALIGNED_ACCESS T1HA_UNALIGNED_ACCESS__SLOW
+#elif defined(__ARM_FEATURE_UNALIGNED)
+#define T1HA_SYS_UNALIGNED_ACCESS T1HA_UNALIGNED_ACCESS__EFFICIENT
+#else
+#define T1HA_SYS_UNALIGNED_ACCESS T1HA_UNALIGNED_ACCESS__UNABLE
+#endif
+#endif /* T1HA_SYS_UNALIGNED_ACCESS */
+
+#define ALIGNMENT_16 2
+#define ALIGNMENT_32 4
+#if UINTPTR_MAX > 0xffffFFFFul || ULONG_MAX > 0xffffFFFFul
+#define ALIGNMENT_64 8
+#else
+#define ALIGNMENT_64 4
+#endif
+
+#ifndef PAGESIZE
+#define PAGESIZE 4096
+#endif /* PAGESIZE */
+
+/***************************************************************************/
+
+#ifndef __has_builtin
+#define __has_builtin(x) (0)
+#endif
+
+#ifndef __has_warning
+#define __has_warning(x) (0)
+#endif
+
+#ifndef __has_feature
+#define __has_feature(x) (0)
+#endif
+
+#ifndef __has_extension
+#define __has_extension(x) (0)
+#endif
+
+#ifndef __has_attribute
+#define __has_attribute(x) (0)
+#endif
+
+#if __has_feature(address_sanitizer)
+#define __SANITIZE_ADDRESS__ 1
+#endif
+
+#ifndef __optimize
+#if defined(__clang__) && !__has_attribute(optimize)
+#define __optimize(ops)
+#elif defined(__GNUC__) || __has_attribute(optimize)
+#define __optimize(ops) __attribute__((optimize(ops)))
+#else
+#define __optimize(ops)
+#endif
+#endif /* __optimize */
+
+#ifndef __cold
+#if defined(__OPTIMIZE__)
+#if defined(__e2k__)
+#define __cold __optimize(1) __attribute__((cold))
+#elif defined(__clang__) && !__has_attribute(cold)
+/* just put infrequently used functions in separate section */
+#define __cold __attribute__((section("text.unlikely"))) __optimize("Os")
+#elif defined(__GNUC__) || __has_attribute(cold)
+#define __cold __attribute__((cold)) __optimize("Os")
+#else
+#define __cold __optimize("Os")
+#endif
+#else
+#define __cold
+#endif
+#endif /* __cold */
+
+
+#if defined(_MSC_VER)
+
+#pragma warning(push, 1)
+
+#include <intrin.h>
+#define likely(cond) (cond)
+#define unlikely(cond) (cond)
+#define unreachable() __assume(0)
+#define bswap64(v) byteswap_64(v)
+#define bswap32(v) byteswap_32(v)
+#define bswap16(v) byteswap_16(v)
+#define rot64(v, s) rotr64(v, s)
+#define rot32(v, s) rotr32(v, s)
+#define __always_inline __forceinline
+
+#ifdef TC_WINDOWS_DRIVER
+#undef assert
+#define assert ASSERT
+#endif
+
+#if defined(_M_X64) || defined(_M_IA64)
+#pragma intrinsic(_umul128)
+#define mul_64x64_128(a, b, ph) _umul128(a, b, ph)
+#endif
+
+#if defined(_M_ARM64) || defined(_M_X64) || defined(_M_IA64)
+#pragma intrinsic(__umulh)
+#define mul_64x64_high(a, b) __umulh(a, b)
+#endif
+
+#pragma warning(pop)
+#pragma warning(disable : 4514) /* 'xyz': unreferenced inline function \
+                                   has been removed */
+#pragma warning(disable : 4710) /* 'xyz': function not inlined */
+#pragma warning(disable : 4711) /* function 'xyz' selected for \
+                                   automatic inline expansion */
+#pragma warning(disable : 4127) /* conditional expression is constant */
+#pragma warning(disable : 4702) /* unreachable code */
+
+#define __GNUC_PREREQ(a,b) 0
+#define UINT64_C(value) value ## ULL
+
+#endif /* Compiler */
+
+#ifndef likely
+#define likely(cond) (cond)
+#endif
+#ifndef unlikely
+#define unlikely(cond) (cond)
+#endif +#ifndef __maybe_unused +#define __maybe_unused +#endif +#ifndef __always_inline +#define __always_inline __inline +#endif +#ifndef unreachable +#define unreachable() \ + do { \ + } while (1) +#endif + + + +#ifndef read_unaligned +#if defined(__GNUC__) || __has_attribute(packed) +typedef struct { + uint8_t unaligned_8; + uint16_t unaligned_16; + uint32_t unaligned_32; + uint64_t unaligned_64; +} __attribute__((packed)) t1ha_unaligned_proxy; +#define read_unaligned(ptr, bits) \ + (((const t1ha_unaligned_proxy *)((const uint8_t *)(ptr)-offsetof( \ + t1ha_unaligned_proxy, unaligned_##bits))) \ + ->unaligned_##bits) +#elif defined(_MSC_VER) +#pragma warning( \ + disable : 4235) /* nonstandard extension used: '__unaligned' \ + * keyword not supported on this architecture */ +#define read_unaligned(ptr, bits) (*(const __unaligned uint##bits##_t *)(ptr)) +#else +#pragma pack(push, 1) +typedef struct { + uint8_t unaligned_8; + uint16_t unaligned_16; + uint32_t unaligned_32; + uint64_t unaligned_64; +} t1ha_unaligned_proxy; +#pragma pack(pop) +#define read_unaligned(ptr, bits) \ + (((const t1ha_unaligned_proxy *)((const uint8_t *)(ptr)-offsetof( \ + t1ha_unaligned_proxy, unaligned_##bits))) \ + ->unaligned_##bits) +#endif +#endif /* read_unaligned */ + +#ifndef read_aligned +#if __GNUC_PREREQ(4, 8) || __has_builtin(__builtin_assume_aligned) +#define read_aligned(ptr, bits) \ + (*(const uint##bits##_t *)__builtin_assume_aligned(ptr, ALIGNMENT_##bits)) +#elif (__GNUC_PREREQ(3, 3) || __has_attribute(aligned)) && !defined(__clang__) +#define read_aligned(ptr, bits) \ + (*(const uint##bits##_t __attribute__((aligned(ALIGNMENT_##bits))) *)(ptr)) +#elif __has_attribute(assume_aligned) + +static __always_inline const + uint16_t *__attribute__((assume_aligned(ALIGNMENT_16))) + cast_aligned_16(const void *ptr) { + return (const uint16_t *)ptr; +} +static __always_inline const + uint32_t *__attribute__((assume_aligned(ALIGNMENT_32))) + cast_aligned_32(const void *ptr) { + return (const uint32_t *)ptr; +} +static __always_inline const + uint64_t *__attribute__((assume_aligned(ALIGNMENT_64))) + cast_aligned_64(const void *ptr) { + return (const uint64_t *)ptr; +} + +#define read_aligned(ptr, bits) (*cast_aligned_##bits(ptr)) + +#elif defined(_MSC_VER) +#define read_aligned(ptr, bits) \ + (*(const __declspec(align(ALIGNMENT_##bits)) uint##bits##_t *)(ptr)) +#else +#define read_aligned(ptr, bits) (*(const uint##bits##_t *)(ptr)) +#endif +#endif /* read_aligned */ + +#ifndef prefetch +#if (__GNUC_PREREQ(4, 0) || __has_builtin(__builtin_prefetch)) && \ + !defined(__ia32__) +#define prefetch(ptr) __builtin_prefetch(ptr) +#elif defined(_M_ARM64) || defined(_M_ARM) +#define prefetch(ptr) __prefetch(ptr) +#else +#define prefetch(ptr) \ + do { \ + (void)(ptr); \ + } while (0) +#endif +#endif /* prefetch */ + +#if __has_warning("-Wconstant-logical-operand") +#if defined(__clang__) +#pragma clang diagnostic ignored "-Wconstant-logical-operand" +#elif defined(__GNUC__) +#pragma GCC diagnostic ignored "-Wconstant-logical-operand" +#else +#pragma warning disable "constant-logical-operand" +#endif +#endif /* -Wconstant-logical-operand */ + +#if __has_warning("-Wtautological-pointer-compare") +#if defined(__clang__) +#pragma clang diagnostic ignored "-Wtautological-pointer-compare" +#elif defined(__GNUC__) +#pragma GCC diagnostic ignored "-Wtautological-pointer-compare" +#else +#pragma warning disable "tautological-pointer-compare" +#endif +#endif /* -Wtautological-pointer-compare */ + 
+/***************************************************************************/ + +#if __GNUC_PREREQ(4, 0) +#pragma GCC visibility push(hidden) +#endif /* __GNUC_PREREQ(4,0) */ + +/*---------------------------------------------------------- Little Endian */ + +#ifndef fetch16_le_aligned +static __always_inline uint16_t fetch16_le_aligned(const void *v) { + assert(((uintptr_t)v) % ALIGNMENT_16 == 0); +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + return read_aligned(v, 16); +#else + return bswap16(read_aligned(v, 16)); +#endif +} +#endif /* fetch16_le_aligned */ + +#ifndef fetch16_le_unaligned +static __always_inline uint16_t fetch16_le_unaligned(const void *v) { +#if T1HA_SYS_UNALIGNED_ACCESS == T1HA_UNALIGNED_ACCESS__UNABLE + const uint8_t *p = (const uint8_t *)v; + return p[0] | (uint16_t)p[1] << 8; +#elif __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + return read_unaligned(v, 16); +#else + return bswap16(read_unaligned(v, 16)); +#endif +} +#endif /* fetch16_le_unaligned */ + +#ifndef fetch32_le_aligned +static __always_inline uint32_t fetch32_le_aligned(const void *v) { + assert(((uintptr_t)v) % ALIGNMENT_32 == 0); +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + return read_aligned(v, 32); +#else + return bswap32(read_aligned(v, 32)); +#endif +} +#endif /* fetch32_le_aligned */ + +#ifndef fetch32_le_unaligned +static __always_inline uint32_t fetch32_le_unaligned(const void *v) { +#if T1HA_SYS_UNALIGNED_ACCESS == T1HA_UNALIGNED_ACCESS__UNABLE + return fetch16_le_unaligned(v) | + (uint32_t)fetch16_le_unaligned((const uint8_t *)v + 2) << 16; +#elif __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + return read_unaligned(v, 32); +#else + return bswap32(read_unaligned(v, 32)); +#endif +} +#endif /* fetch32_le_unaligned */ + +#ifndef fetch64_le_aligned +static __always_inline uint64_t fetch64_le_aligned(const void *v) { + assert(((uintptr_t)v) % ALIGNMENT_64 == 0); +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + return read_aligned(v, 64); +#else + return bswap64(read_aligned(v, 64)); +#endif +} +#endif /* fetch64_le_aligned */ + +#ifndef fetch64_le_unaligned +static __always_inline uint64_t fetch64_le_unaligned(const void *v) { +#if T1HA_SYS_UNALIGNED_ACCESS == T1HA_UNALIGNED_ACCESS__UNABLE + return fetch32_le_unaligned(v) | + (uint64_t)fetch32_le_unaligned((const uint8_t *)v + 4) << 32; +#elif __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + return read_unaligned(v, 64); +#else + return bswap64(read_unaligned(v, 64)); +#endif +} +#endif /* fetch64_le_unaligned */ + +static __always_inline uint64_t tail64_le_aligned(const void *v, size_t tail) { + const uint8_t *const p = (const uint8_t *)v; +#if T1HA_USE_FAST_ONESHOT_READ && !defined(__SANITIZE_ADDRESS__) + /* We can perform a 'oneshot' read, which is little bit faster. */ + const unsigned shift = ((8 - tail) & 7) << 3; + return fetch64_le_aligned(p) & ((~UINT64_C(0)) >> shift); +#else + uint64_t r = 0; + switch (tail & 7) { + default: + unreachable(); +/* fall through */ +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + /* For most CPUs this code is better when not needed byte reordering. 
*/ + case 0: + return fetch64_le_aligned(p); + case 7: + r = (uint64_t)p[6] << 8; + /* fall through */ + case 6: + r += p[5]; + r <<= 8; + /* fall through */ + case 5: + r += p[4]; + r <<= 32; + /* fall through */ + case 4: + return r + fetch32_le_aligned(p); + case 3: + r = (uint64_t)p[2] << 16; + /* fall through */ + case 2: + return r + fetch16_le_aligned(p); + case 1: + return p[0]; +#else + case 0: + r = p[7] << 8; + /* fall through */ + case 7: + r += p[6]; + r <<= 8; + /* fall through */ + case 6: + r += p[5]; + r <<= 8; + /* fall through */ + case 5: + r += p[4]; + r <<= 8; + /* fall through */ + case 4: + r += p[3]; + r <<= 8; + /* fall through */ + case 3: + r += p[2]; + r <<= 8; + /* fall through */ + case 2: + r += p[1]; + r <<= 8; + /* fall through */ + case 1: + return r + p[0]; +#endif + } +#endif /* T1HA_USE_FAST_ONESHOT_READ */ +} + +#if T1HA_USE_FAST_ONESHOT_READ && \ + T1HA_SYS_UNALIGNED_ACCESS != T1HA_UNALIGNED_ACCESS__UNABLE && \ + defined(PAGESIZE) && PAGESIZE > 42 && !defined(__SANITIZE_ADDRESS__) +#define can_read_underside(ptr, size) \ + (((PAGESIZE - (size)) & (uintptr_t)(ptr)) != 0) +#endif /* T1HA_USE_FAST_ONESHOT_READ */ + +static __always_inline uint64_t tail64_le_unaligned(const void *v, + size_t tail) { + const uint8_t *p = (const uint8_t *)v; +#if defined(can_read_underside) && \ + (UINTPTR_MAX > 0xffffFFFFul || ULONG_MAX > 0xffffFFFFul) + /* On some systems (e.g. x86_64) we can perform a 'oneshot' read, which + * is little bit faster. Thanks Marcin Żukowski + * for the reminder. */ + const unsigned offset = (8 - tail) & 7; + const unsigned shift = offset << 3; + if (likely(can_read_underside(p, 8))) { + p -= offset; + return fetch64_le_unaligned(p) >> shift; + } + return fetch64_le_unaligned(p) & ((~UINT64_C(0)) >> shift); +#else + uint64_t r = 0; + switch (tail & 7) { + default: + unreachable(); +/* fall through */ +#if T1HA_SYS_UNALIGNED_ACCESS == T1HA_UNALIGNED_ACCESS__EFFICIENT && \ + __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + /* For most CPUs this code is better when not needed + * copying for alignment or byte reordering. */ + case 0: + return fetch64_le_unaligned(p); + case 7: + r = (uint64_t)p[6] << 8; + /* fall through */ + case 6: + r += p[5]; + r <<= 8; + /* fall through */ + case 5: + r += p[4]; + r <<= 32; + /* fall through */ + case 4: + return r + fetch32_le_unaligned(p); + case 3: + r = (uint64_t)p[2] << 16; + /* fall through */ + case 2: + return r + fetch16_le_unaligned(p); + case 1: + return p[0]; +#else + /* For most CPUs this code is better than a + * copying for alignment and/or byte reordering. 
*/ + case 0: + r = p[7] << 8; + /* fall through */ + case 7: + r += p[6]; + r <<= 8; + /* fall through */ + case 6: + r += p[5]; + r <<= 8; + /* fall through */ + case 5: + r += p[4]; + r <<= 8; + /* fall through */ + case 4: + r += p[3]; + r <<= 8; + /* fall through */ + case 3: + r += p[2]; + r <<= 8; + /* fall through */ + case 2: + r += p[1]; + r <<= 8; + /* fall through */ + case 1: + return r + p[0]; +#endif + } +#endif /* can_read_underside */ +} + +/*------------------------------------------------------------- Big Endian */ + +#ifndef fetch16_be_aligned +static __maybe_unused __always_inline uint16_t +fetch16_be_aligned(const void *v) { + assert(((uintptr_t)v) % ALIGNMENT_16 == 0); +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + return read_aligned(v, 16); +#else + return bswap16(read_aligned(v, 16)); +#endif +} +#endif /* fetch16_be_aligned */ + +#ifndef fetch16_be_unaligned +static __maybe_unused __always_inline uint16_t +fetch16_be_unaligned(const void *v) { +#if T1HA_SYS_UNALIGNED_ACCESS == T1HA_UNALIGNED_ACCESS__UNABLE + const uint8_t *p = (const uint8_t *)v; + return (uint16_t)p[0] << 8 | p[1]; +#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + return read_unaligned(v, 16); +#else + return bswap16(read_unaligned(v, 16)); +#endif +} +#endif /* fetch16_be_unaligned */ + +#ifndef fetch32_be_aligned +static __maybe_unused __always_inline uint32_t +fetch32_be_aligned(const void *v) { + assert(((uintptr_t)v) % ALIGNMENT_32 == 0); +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + return read_aligned(v, 32); +#else + return bswap32(read_aligned(v, 32)); +#endif +} +#endif /* fetch32_be_aligned */ + +#ifndef fetch32_be_unaligned +static __maybe_unused __always_inline uint32_t +fetch32_be_unaligned(const void *v) { +#if T1HA_SYS_UNALIGNED_ACCESS == T1HA_UNALIGNED_ACCESS__UNABLE + return (uint32_t)fetch16_be_unaligned(v) << 16 | + fetch16_be_unaligned((const uint8_t *)v + 2); +#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + return read_unaligned(v, 32); +#else + return bswap32(read_unaligned(v, 32)); +#endif +} +#endif /* fetch32_be_unaligned */ + +#ifndef fetch64_be_aligned +static __maybe_unused __always_inline uint64_t +fetch64_be_aligned(const void *v) { + assert(((uintptr_t)v) % ALIGNMENT_64 == 0); +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + return read_aligned(v, 64); +#else + return bswap64(read_aligned(v, 64)); +#endif +} +#endif /* fetch64_be_aligned */ + +#ifndef fetch64_be_unaligned +static __maybe_unused __always_inline uint64_t +fetch64_be_unaligned(const void *v) { +#if T1HA_SYS_UNALIGNED_ACCESS == T1HA_UNALIGNED_ACCESS__UNABLE + return (uint64_t)fetch32_be_unaligned(v) << 32 | + fetch32_be_unaligned((const uint8_t *)v + 4); +#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + return read_unaligned(v, 64); +#else + return bswap64(read_unaligned(v, 64)); +#endif +} +#endif /* fetch64_be_unaligned */ + +static __maybe_unused __always_inline uint64_t tail64_be_aligned(const void *v, + size_t tail) { + const uint8_t *const p = (const uint8_t *)v; +#if T1HA_USE_FAST_ONESHOT_READ && !defined(__SANITIZE_ADDRESS__) + /* We can perform a 'oneshot' read, which is little bit faster. */ + const unsigned shift = ((8 - tail) & 7) << 3; + return fetch64_be_aligned(p) >> shift; +#else + switch (tail & 7) { + default: + unreachable(); +/* fall through */ +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + /* For most CPUs this code is better when not byte reordering. 
*/
+  case 1:
+    return p[0];
+  case 2:
+    return fetch16_be_aligned(p);
+  case 3:
+    return (uint32_t)fetch16_be_aligned(p) << 8 | p[2];
+  case 4:
+    return fetch32_be_aligned(p);
+  case 5:
+    return (uint64_t)fetch32_be_aligned(p) << 8 | p[4];
+  case 6:
+    return (uint64_t)fetch32_be_aligned(p) << 16 | fetch16_be_aligned(p + 4);
+  case 7:
+    return (uint64_t)fetch32_be_aligned(p) << 24 |
+           (uint32_t)fetch16_be_aligned(p + 4) << 8 | p[6];
+  case 0:
+    return fetch64_be_aligned(p);
+#else
+  case 1:
+    return p[0];
+  case 2:
+    return p[1] | (uint32_t)p[0] << 8;
+  case 3:
+    return p[2] | (uint32_t)p[1] << 8 | (uint32_t)p[0] << 16;
+  case 4:
+    return p[3] | (uint32_t)p[2] << 8 | (uint32_t)p[1] << 16 |
+           (uint32_t)p[0] << 24;
+  case 5:
+    return p[4] | (uint32_t)p[3] << 8 | (uint32_t)p[2] << 16 |
+           (uint32_t)p[1] << 24 | (uint64_t)p[0] << 32;
+  case 6:
+    return p[5] | (uint32_t)p[4] << 8 | (uint32_t)p[3] << 16 |
+           (uint32_t)p[2] << 24 | (uint64_t)p[1] << 32 | (uint64_t)p[0] << 40;
+  case 7:
+    return p[6] | (uint32_t)p[5] << 8 | (uint32_t)p[4] << 16 |
+           (uint32_t)p[3] << 24 | (uint64_t)p[2] << 32 | (uint64_t)p[1] << 40 |
+           (uint64_t)p[0] << 48;
+  case 0:
+    return p[7] | (uint32_t)p[6] << 8 | (uint32_t)p[5] << 16 |
+           (uint32_t)p[4] << 24 | (uint64_t)p[3] << 32 | (uint64_t)p[2] << 40 |
+           (uint64_t)p[1] << 48 | (uint64_t)p[0] << 56;
+#endif
+  }
+#endif /* T1HA_USE_FAST_ONESHOT_READ */
+}
+
+static __maybe_unused __always_inline uint64_t
+tail64_be_unaligned(const void *v, size_t tail) {
+  const uint8_t *p = (const uint8_t *)v;
+#if defined(can_read_underside) && \
+    (UINTPTR_MAX > 0xffffFFFFul || ULONG_MAX > 0xffffFFFFul)
+  /* On some systems (e.g. x86_64) we can perform a 'oneshot' read, which
+   * is little bit faster. Thanks Marcin Żukowski
+   * for the reminder. */
+  const unsigned offset = (8 - tail) & 7;
+  const unsigned shift = offset << 3;
+  if (likely(can_read_underside(p, 8))) {
+    p -= offset;
+    return fetch64_be_unaligned(p) & ((~UINT64_C(0)) >> shift);
+  }
+  return fetch64_be_unaligned(p) >> shift;
+#else
+  switch (tail & 7) {
+  default:
+    unreachable();
+/* fall through */
+#if T1HA_SYS_UNALIGNED_ACCESS == T1HA_UNALIGNED_ACCESS__EFFICIENT && \
+    __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+  /* For most CPUs this code is better when not needed
+   * copying for alignment or byte reordering. */
+  case 1:
+    return p[0];
+  case 2:
+    return fetch16_be_unaligned(p);
+  case 3:
+    return (uint32_t)fetch16_be_unaligned(p) << 8 | p[2];
+  case 4:
+    return fetch32_be_unaligned(p);
+  case 5:
+    return (uint64_t)fetch32_be_unaligned(p) << 8 | p[4];
+  case 6:
+    return (uint64_t)fetch32_be_unaligned(p) << 16 |
+           fetch16_be_unaligned(p + 4);
+  case 7:
+    return (uint64_t)fetch32_be_unaligned(p) << 24 |
+           (uint32_t)fetch16_be_unaligned(p + 4) << 8 | p[6];
+  case 0:
+    return fetch64_be_unaligned(p);
+#else
+  /* For most CPUs this code is better than a
+   * copying for alignment and/or byte reordering.
*/ + case 1: + return p[0]; + case 2: + return p[1] | (uint32_t)p[0] << 8; + case 3: + return p[2] | (uint32_t)p[1] << 8 | (uint32_t)p[0] << 16; + case 4: + return p[3] | (uint32_t)p[2] << 8 | (uint32_t)p[1] << 16 | + (uint32_t)p[0] << 24; + case 5: + return p[4] | (uint32_t)p[3] << 8 | (uint32_t)p[2] << 16 | + (uint32_t)p[1] << 24 | (uint64_t)p[0] << 32; + case 6: + return p[5] | (uint32_t)p[4] << 8 | (uint32_t)p[3] << 16 | + (uint32_t)p[2] << 24 | (uint64_t)p[1] << 32 | (uint64_t)p[0] << 40; + case 7: + return p[6] | (uint32_t)p[5] << 8 | (uint32_t)p[4] << 16 | + (uint32_t)p[3] << 24 | (uint64_t)p[2] << 32 | (uint64_t)p[1] << 40 | + (uint64_t)p[0] << 48; + case 0: + return p[7] | (uint32_t)p[6] << 8 | (uint32_t)p[5] << 16 | + (uint32_t)p[4] << 24 | (uint64_t)p[3] << 32 | (uint64_t)p[2] << 40 | + (uint64_t)p[1] << 48 | (uint64_t)p[0] << 56; +#endif + } +#endif /* can_read_underside */ +} + +/***************************************************************************/ + +#ifndef rot64 +static __always_inline uint64_t rot64(uint64_t v, unsigned s) { + return (v >> s) | (v << (64 - s)); +} +#endif /* rot64 */ + +#ifndef mul_32x32_64 +static __always_inline uint64_t mul_32x32_64(uint32_t a, uint32_t b) { + return a * (uint64_t)b; +} +#endif /* mul_32x32_64 */ + +#ifndef add64carry_first +static __maybe_unused __always_inline unsigned +add64carry_first(uint64_t base, uint64_t addend, uint64_t *sum) { +#if __has_builtin(__builtin_addcll) + unsigned long long carryout; + *sum = __builtin_addcll(base, addend, 0, &carryout); + return (unsigned)carryout; +#else + *sum = base + addend; + return *sum < addend; +#endif /* __has_builtin(__builtin_addcll) */ +} +#endif /* add64carry_fist */ + +#ifndef add64carry_next +static __maybe_unused __always_inline unsigned +add64carry_next(unsigned carry, uint64_t base, uint64_t addend, uint64_t *sum) { +#if __has_builtin(__builtin_addcll) + unsigned long long carryout; + *sum = __builtin_addcll(base, addend, carry, &carryout); + return (unsigned)carryout; +#else + *sum = base + addend + carry; + return *sum < addend || (carry && *sum == addend); +#endif /* __has_builtin(__builtin_addcll) */ +} +#endif /* add64carry_next */ + +#ifndef add64carry_last +static __maybe_unused __always_inline void +add64carry_last(unsigned carry, uint64_t base, uint64_t addend, uint64_t *sum) { +#if __has_builtin(__builtin_addcll) + unsigned long long carryout; + *sum = __builtin_addcll(base, addend, carry, &carryout); + (void)carryout; +#else + *sum = base + addend + carry; +#endif /* __has_builtin(__builtin_addcll) */ +} +#endif /* add64carry_last */ + +#ifndef mul_64x64_128 +static __maybe_unused __always_inline uint64_t mul_64x64_128(uint64_t a, + uint64_t b, + uint64_t *h) { +#if defined(__SIZEOF_INT128__) || \ + (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128) + __uint128_t r = (__uint128_t)a * (__uint128_t)b; + /* modern GCC could nicely optimize this */ + *h = (uint64_t)(r >> 64); + return (uint64_t)r; +#elif defined(mul_64x64_high) + *h = mul_64x64_high(a, b); + return a * b; +#else + /* performs 64x64 to 128 bit multiplication */ + const uint64_t ll = mul_32x32_64((uint32_t)a, (uint32_t)b); + const uint64_t lh = mul_32x32_64(a >> 32, (uint32_t)b); + const uint64_t hl = mul_32x32_64((uint32_t)a, b >> 32); + const uint64_t hh = mul_32x32_64(a >> 32, b >> 32); + + /* Few simplification are possible here for 32-bit architectures, + * but thus we would lost compatibility with the original 64-bit + * version. 
We think this is a bad idea, because a 32-bit t1ha would
+   * then still be (relatively) slow and yet no longer compatible. */
+  uint64_t l;
+  add64carry_last(add64carry_first(ll, lh << 32, &l), hh, lh >> 32, h);
+  add64carry_last(add64carry_first(l, hl << 32, &l), *h, hl >> 32, h);
+  return l;
+#endif
+}
+#endif /* mul_64x64_128() */
+
+#ifndef mul_64x64_high
+static __maybe_unused __always_inline uint64_t mul_64x64_high(uint64_t a,
+                                                              uint64_t b) {
+  uint64_t h;
+  mul_64x64_128(a, b, &h);
+  return h;
+}
+#endif /* mul_64x64_high */
+
+/***************************************************************************/
+
+/* 'magic' primes */
+static const uint64_t prime_0 = UINT64_C(0xEC99BF0D8372CAAB);
+static const uint64_t prime_1 = UINT64_C(0x82434FE90EDCEF39);
+static const uint64_t prime_2 = UINT64_C(0xD4F06DB99D67BE4B);
+static const uint64_t prime_3 = UINT64_C(0xBD9CACC22C6E9571);
+static const uint64_t prime_4 = UINT64_C(0x9C06FAF4D023E3AB);
+static const uint64_t prime_5 = UINT64_C(0xC060724A8424F345);
+static const uint64_t prime_6 = UINT64_C(0xCB5AF53AE3AAAC31);
+
+/* xor high and low parts of full 128-bit product */
+static __maybe_unused __always_inline uint64_t mux64(uint64_t v,
+                                                     uint64_t prime) {
+  uint64_t l, h;
+  l = mul_64x64_128(v, prime, &h);
+  return l ^ h;
+}
+
+static __always_inline uint64_t final64(uint64_t a, uint64_t b) {
+  uint64_t x = (a + rot64(b, 41)) * prime_0;
+  uint64_t y = (rot64(a, 23) + b) * prime_6;
+  return mux64(x ^ y, prime_5);
+}
+
+static __always_inline void mixup64(uint64_t *__restrict a,
+                                    uint64_t *__restrict b, uint64_t v,
+                                    uint64_t prime) {
+  uint64_t h;
+  *a ^= mul_64x64_128(*b + v, prime, &h);
+  *b += h;
+}
-- 
cgit v1.2.3
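
Editor's illustration: the finalization primitives at the end of the header (mux64 and
final64) are easy to exercise in isolation. The following is a minimal standalone sketch,
not part of the patch, that mirrors their definitions using __uint128_t for the 64x64->128
multiply, as in the __SIZEOF_INT128__ branch of mul_64x64_128(); it assumes a GCC/Clang-style
compiler on a 64-bit target, and the sample inputs are arbitrary.

/* sketch_t1ha_final64.c -- hypothetical test harness, not VeraCrypt code */
#include <stdint.h>
#include <stdio.h>

/* primes copied from t1ha_bits.h above */
static const uint64_t prime_0 = UINT64_C(0xEC99BF0D8372CAAB);
static const uint64_t prime_5 = UINT64_C(0xC060724A8424F345);
static const uint64_t prime_6 = UINT64_C(0xCB5AF53AE3AAAC31);

/* rotate right, 0 < s < 64, same as the header's fallback rot64() */
static uint64_t rot64(uint64_t v, unsigned s) {
  return (v >> s) | (v << (64 - s));
}

/* xor-fold the high and low halves of the full 128-bit product,
 * equivalent to mux64() built on mul_64x64_128() */
static uint64_t mux64(uint64_t v, uint64_t prime) {
  __uint128_t r = (__uint128_t)v * prime;
  return (uint64_t)r ^ (uint64_t)(r >> 64);
}

/* final mixing of two state words, as defined at the end of the header */
static uint64_t final64(uint64_t a, uint64_t b) {
  uint64_t x = (a + rot64(b, 41)) * prime_0;
  uint64_t y = (rot64(a, 23) + b) * prime_6;
  return mux64(x ^ y, prime_5);
}

int main(void) {
  /* arbitrary sample state words, chosen only for demonstration */
  uint64_t a = UINT64_C(0x0123456789ABCDEF), b = UINT64_C(42);
  printf("final64 = %016llx\n", (unsigned long long)final64(a, b));
  return 0;
}

The xor of the high and low product halves in mux64 is how the header compresses the full
128-bit multiply back to 64 bits without simply discarding the upper half, which is the key
trick behind t1ha's mixing strength at low cost.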