VeraCrypt
path: root/src/Common/lzma/CpuArch.c
Diffstat (limited to 'src/Common/lzma/CpuArch.c')
-rw-r--r--  src/Common/lzma/CpuArch.c  795
1 file changed, 570 insertions(+), 225 deletions(-)
diff --git a/src/Common/lzma/CpuArch.c b/src/Common/lzma/CpuArch.c
index fa9afe39..33f8a3ab 100644
--- a/src/Common/lzma/CpuArch.c
+++ b/src/Common/lzma/CpuArch.c
@@ -1,187 +1,318 @@
/* CpuArch.c -- CPU specific code
-2021-07-13 : Igor Pavlov : Public domain */
+2023-05-18 : Igor Pavlov : Public domain */
#include "Precomp.h"
+// #include <stdio.h>
+
#include "CpuArch.h"
#ifdef MY_CPU_X86_OR_AMD64
-#if (defined(_MSC_VER) && !defined(MY_CPU_AMD64)) || defined(__GNUC__)
-#define USE_ASM
+#undef NEED_CHECK_FOR_CPUID
+#if !defined(MY_CPU_AMD64)
+#define NEED_CHECK_FOR_CPUID
#endif
-#if !defined(USE_ASM) && _MSC_VER >= 1500
-#include <intrin.h>
+/*
+  The cpuid instruction supports a (subFunction) parameter in ECX,
+  which is used only with some specific (function) parameter values.
+  But we always use only (subFunction == 0).
+*/
+/*
+  __cpuid(): MSVC and GCC/CLANG use the same function/macro name,
+  but the parameters are different.
+  We use the MSVC __cpuid() parameter style for our z7_x86_cpuid() function.
+*/
+
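
A minimal standalone sketch of the two incompatible parameter styles the comment refers to (using only the documented MSVC intrinsic and the GCC/CLANG <cpuid.h> macro):

    #include <stdio.h>
    #ifdef _MSC_VER
    #include <intrin.h>
    #else
    #include <cpuid.h>
    #endif

    int main(void)
    {
      unsigned a, b, c, d;
    #ifdef _MSC_VER
      int info[4];
      __cpuid(info, 1);         // MSVC: output array first, then (func)
      a = (unsigned)info[0]; b = (unsigned)info[1];
      c = (unsigned)info[2]; d = (unsigned)info[3];
    #else
      __cpuid(1, a, b, c, d);   // GCC/CLANG <cpuid.h>: (func) first, then outputs
    #endif
      printf("eax=%08x ebx=%08x ecx=%08x edx=%08x\n", a, b, c, d);
      return 0;
    }
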
+#if defined(__GNUC__) /* && (__GNUC__ >= 10) */ \
+ || defined(__clang__) /* && (__clang_major__ >= 10) */
+
+/* there were some CLANG/GCC compilers that had issues with
+   rbx(ebx) handling in asm blocks in -fPIC mode (__PIC__ is defined).
+   The compiler's <cpuid.h> contains a __cpuid() macro that is similar to our code.
+   The history of __cpuid() changes in CLANG/GCC:
+   GCC:
+     2007: it preserved ebx for (__PIC__ && __i386__)
+     2013: it preserved rbx and ebx for __PIC__
+     2014: it doesn't preserve rbx and ebx anymore
+   We suppose that (__GNUC__ >= 5) fixed that __PIC__ ebx/rbx problem.
+   CLANG:
+     2014+: it preserves rbx, but only for 64-bit code. No __PIC__ check.
+   Why does CLANG care only about 64-bit mode, and not about ebx (in 32-bit)?
+   Do we need a __PIC__ test for CLANG, or must we care about rbx even if
+   __PIC__ is not defined?
+*/
+
+#define ASM_LN "\n"
+
+#if defined(MY_CPU_AMD64) && defined(__PIC__) \
+ && ((defined (__GNUC__) && (__GNUC__ < 5)) || defined(__clang__))
+
+#define x86_cpuid_MACRO(p, func) { \
+ __asm__ __volatile__ ( \
+ ASM_LN "mov %%rbx, %q1" \
+ ASM_LN "cpuid" \
+ ASM_LN "xchg %%rbx, %q1" \
+ : "=a" ((p)[0]), "=&r" ((p)[1]), "=c" ((p)[2]), "=d" ((p)[3]) : "0" (func), "2"(0)); }
+
+ /* "=&r" selects free register. It can select even rbx, if that register is free.
+ "=&D" for (RDI) also works, but the code can be larger with "=&D"
+ "2"(0) means (subFunction = 0),
+ 2 is (zero-based) index in the output constraint list "=c" (ECX). */
+
+#elif defined(MY_CPU_X86) && defined(__PIC__) \
+ && ((defined (__GNUC__) && (__GNUC__ < 5)) || defined(__clang__))
+
+#define x86_cpuid_MACRO(p, func) { \
+ __asm__ __volatile__ ( \
+ ASM_LN "mov %%ebx, %k1" \
+ ASM_LN "cpuid" \
+ ASM_LN "xchg %%ebx, %k1" \
+ : "=a" ((p)[0]), "=&r" ((p)[1]), "=c" ((p)[2]), "=d" ((p)[3]) : "0" (func), "2"(0)); }
+
+#else
+
+#define x86_cpuid_MACRO(p, func) { \
+ __asm__ __volatile__ ( \
+ ASM_LN "cpuid" \
+ : "=a" ((p)[0]), "=b" ((p)[1]), "=c" ((p)[2]), "=d" ((p)[3]) : "0" (func), "2"(0)); }
+
#endif
-#if defined(USE_ASM) && !defined(MY_CPU_AMD64)
-static UInt32 CheckFlag(UInt32 flag)
-{
- #ifdef _MSC_VER
- __asm pushfd;
- __asm pop EAX;
- __asm mov EDX, EAX;
- __asm xor EAX, flag;
- __asm push EAX;
- __asm popfd;
- __asm pushfd;
- __asm pop EAX;
- __asm xor EAX, EDX;
- __asm push EDX;
- __asm popfd;
- __asm and flag, EAX;
- #else
+
+void Z7_FASTCALL z7_x86_cpuid(UInt32 p[4], UInt32 func)
+{
+ x86_cpuid_MACRO(p, func)
+}
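
For illustration, a caller can read the vendor string through this MSVC-style interface; cpuid function 0 returns the 12 vendor bytes in EBX, EDX, ECX order (p[1], p[3], p[2], the same ordering the removed MyCPUID() vendor code below relied on):

    // sketch: reading the vendor string ("GenuineIntel", "AuthenticAMD", ...)
    #include <string.h>
    UInt32 p[4];
    char vendor[13];
    z7_x86_cpuid(p, 0);            // func == 0: p[0] = max supported func
    memcpy(vendor + 0, &p[1], 4);  // EBX: "Genu"
    memcpy(vendor + 4, &p[3], 4);  // EDX: "ineI"
    memcpy(vendor + 8, &p[2], 4);  // ECX: "ntel"
    vendor[12] = '\0';
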
+
+
+Z7_NO_INLINE
+UInt32 Z7_FASTCALL z7_x86_cpuid_GetMaxFunc(void)
+{
+ #if defined(NEED_CHECK_FOR_CPUID)
+ #define EFALGS_CPUID_BIT 21
+ UInt32 a;
__asm__ __volatile__ (
- "pushf\n\t"
- "pop %%EAX\n\t"
- "movl %%EAX,%%EDX\n\t"
- "xorl %0,%%EAX\n\t"
- "push %%EAX\n\t"
- "popf\n\t"
- "pushf\n\t"
- "pop %%EAX\n\t"
- "xorl %%EDX,%%EAX\n\t"
- "push %%EDX\n\t"
- "popf\n\t"
- "andl %%EAX, %0\n\t":
- "=c" (flag) : "c" (flag) :
- "%eax", "%edx");
+ ASM_LN "pushf"
+ ASM_LN "pushf"
+ ASM_LN "pop %0"
+ // ASM_LN "movl %0, %1"
+ // ASM_LN "xorl $0x200000, %0"
+ ASM_LN "btc %1, %0"
+ ASM_LN "push %0"
+ ASM_LN "popf"
+ ASM_LN "pushf"
+ ASM_LN "pop %0"
+ ASM_LN "xorl (%%esp), %0"
+
+ ASM_LN "popf"
+ ASM_LN
+ : "=&r" (a) // "=a"
+ : "i" (EFALGS_CPUID_BIT)
+ );
+ if ((a & (1 << EFALGS_CPUID_BIT)) == 0)
+ return 0;
+ #endif
+ {
+ UInt32 p[4];
+ x86_cpuid_MACRO(p, 0)
+ return p[0];
+ }
+}
+
+#undef ASM_LN
+
+#elif !defined(_MSC_VER)
+
+/*
+// for gcc/clang and others: we can try to use the __cpuid macro:
+#include <cpuid.h>
+void Z7_FASTCALL z7_x86_cpuid(UInt32 p[4], UInt32 func)
+{
+ __cpuid(func, p[0], p[1], p[2], p[3]);
+}
+UInt32 Z7_FASTCALL z7_x86_cpuid_GetMaxFunc(void)
+{
+ return (UInt32)__get_cpuid_max(0, NULL);
+}
+*/
+// fallback when cpuid is unsupported:
+void Z7_FASTCALL z7_x86_cpuid(UInt32 p[4], UInt32 func)
+{
+ UNUSED_VAR(func)
+ p[0] = p[1] = p[2] = p[3] = 0;
+}
+UInt32 Z7_FASTCALL z7_x86_cpuid_GetMaxFunc(void)
+{
+ return 0;
+}
+
+#else // _MSC_VER
+
+#if !defined(MY_CPU_AMD64)
+
+UInt32 __declspec(naked) Z7_FASTCALL z7_x86_cpuid_GetMaxFunc(void)
+{
+ #if defined(NEED_CHECK_FOR_CPUID)
+ #define EFALGS_CPUID_BIT 21
+ __asm pushfd
+ __asm pushfd
+ /*
+ __asm pop eax
+ // __asm mov edx, eax
+ __asm btc eax, EFALGS_CPUID_BIT
+ __asm push eax
+ */
+ __asm btc dword ptr [esp], EFALGS_CPUID_BIT
+ __asm popfd
+ __asm pushfd
+ __asm pop eax
+ // __asm xor eax, edx
+ __asm xor eax, [esp]
+ // __asm push edx
+ __asm popfd
+ __asm and eax, (1 shl EFALGS_CPUID_BIT)
+ __asm jz end_func
+ #endif
+ __asm push ebx
+ __asm xor eax, eax // func
+ __asm xor ecx, ecx // subFunction (optional) for (func == 0)
+ __asm cpuid
+ __asm pop ebx
+ #if defined(NEED_CHECK_FOR_CPUID)
+ end_func:
#endif
- return flag;
+ __asm ret 0
}
-#define CHECK_CPUID_IS_SUPPORTED if (CheckFlag(1 << 18) == 0 || CheckFlag(1 << 21) == 0) return False;
-#else
-#define CHECK_CPUID_IS_SUPPORTED
-#endif
-#ifndef USE_ASM
- #ifdef _MSC_VER
+void __declspec(naked) Z7_FASTCALL z7_x86_cpuid(UInt32 p[4], UInt32 func)
+{
+ UNUSED_VAR(p)
+ UNUSED_VAR(func)
+ __asm push ebx
+ __asm push edi
+ __asm mov edi, ecx // p
+ __asm mov eax, edx // func
+ __asm xor ecx, ecx // subfunction (optional) for (func == 0)
+ __asm cpuid
+ __asm mov [edi ], eax
+ __asm mov [edi + 4], ebx
+ __asm mov [edi + 8], ecx
+ __asm mov [edi + 12], edx
+ __asm pop edi
+ __asm pop ebx
+ __asm ret 0
+}
+
+#else // MY_CPU_AMD64
+
#if _MSC_VER >= 1600
- #define MY__cpuidex __cpuidex
+ #include <intrin.h>
+ #define MY_cpuidex __cpuidex
#else
-
/*
- __cpuid (function == 4) requires subfunction number in ECX.
+ __cpuid (func == (0 or 7)) requires subfunction number in ECX.
MSDN: The __cpuid intrinsic clears the ECX register before calling the cpuid instruction.
__cpuid() in new MSVC clears ECX.
- __cpuid() in old MSVC (14.00) doesn't clear ECX
- We still can use __cpuid for low (function) values that don't require ECX,
- but __cpuid() in old MSVC will be incorrect for some function values: (function == 4).
+ __cpuid() in old MSVC (14.00) x64 doesn't clear ECX.
+ We can still use __cpuid for low (func) values that don't require ECX,
+ but __cpuid() in old MSVC will be incorrect for some func values: (func == 7).
So here we use a hack for old MSVC to send (subFunction) in the ECX register to the cpuid instruction,
- where ECX value is first parameter for FAST_CALL / NO_INLINE function,
- So the caller of MY__cpuidex_HACK() sets ECX as subFunction, and
+ where the ECX value is the first parameter of a FASTCALL / NO_INLINE function.
+ So the caller of MY_cpuidex_HACK() sets ECX to subFunction, and
old MSVC's __cpuid() doesn't change ECX, so the cpuid instruction gets the (subFunction) value.
- DON'T remove MY_NO_INLINE and MY_FAST_CALL for MY__cpuidex_HACK() !!!
+ DON'T remove Z7_NO_INLINE and Z7_FASTCALL for MY_cpuidex_HACK() !!!
*/
-
static
-MY_NO_INLINE
-void MY_FAST_CALL MY__cpuidex_HACK(UInt32 subFunction, int *CPUInfo, UInt32 function)
+Z7_NO_INLINE void Z7_FASTCALL MY_cpuidex_HACK(UInt32 subFunction, UInt32 func, int *CPUInfo)
{
- UNUSED_VAR(subFunction);
- __cpuid(CPUInfo, function);
+ UNUSED_VAR(subFunction)
+ __cpuid(CPUInfo, func);
}
-
- #define MY__cpuidex(info, func, func2) MY__cpuidex_HACK(func2, info, func)
- #pragma message("======== MY__cpuidex_HACK WAS USED ========")
- #endif
- #else
- #define MY__cpuidex(info, func, func2) __cpuid(info, func)
- #pragma message("======== (INCORRECT ?) cpuid WAS USED ========")
- #endif
+ #define MY_cpuidex(info, func, func2) MY_cpuidex_HACK(func2, func, info)
+ #pragma message("======== MY_cpuidex_HACK WAS USED ========")
+ #endif // _MSC_VER >= 1600
+
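To see why the hack works, here is an annotated call trace (a sketch; it relies only on __fastcall passing the first argument in ECX/RCX and on the call not being inlined):

    // MY_cpuidex(info, 7, 0) expands to:
    //   MY_cpuidex_HACK(0, 7, info)   // Z7_FASTCALL: first arg (0) lands in ECX
    //     -> real call (Z7_NO_INLINE), so ECX is still 0 on entry
    //     -> __cpuid(info, 7)         // old MSVC leaves ECX untouched
    //        -> cpuid runs with EAX = 7, ECX = 0: the desired (func, subFunction)
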
+#if !defined(MY_CPU_AMD64)
+/* inlining of __cpuid() in MSVC x86 (32-bit) produces big, inefficient code,
+   so we disable inlining here */
+Z7_NO_INLINE
#endif
-
-
-
-
-void MyCPUID(UInt32 function, UInt32 *a, UInt32 *b, UInt32 *c, UInt32 *d)
+void Z7_FASTCALL z7_x86_cpuid(UInt32 p[4], UInt32 func)
{
- #ifdef USE_ASM
-
- #ifdef _MSC_VER
-
- UInt32 a2, b2, c2, d2;
- __asm xor EBX, EBX;
- __asm xor ECX, ECX;
- __asm xor EDX, EDX;
- __asm mov EAX, function;
- __asm cpuid;
- __asm mov a2, EAX;
- __asm mov b2, EBX;
- __asm mov c2, ECX;
- __asm mov d2, EDX;
-
- *a = a2;
- *b = b2;
- *c = c2;
- *d = d2;
-
- #else
-
- __asm__ __volatile__ (
- #if defined(MY_CPU_AMD64) && defined(__PIC__)
- "mov %%rbx, %%rdi;"
- "cpuid;"
- "xchg %%rbx, %%rdi;"
- : "=a" (*a) ,
- "=D" (*b) ,
- #elif defined(MY_CPU_X86) && defined(__PIC__)
- "mov %%ebx, %%edi;"
- "cpuid;"
- "xchgl %%ebx, %%edi;"
- : "=a" (*a) ,
- "=D" (*b) ,
- #else
- "cpuid"
- : "=a" (*a) ,
- "=b" (*b) ,
- #endif
- "=c" (*c) ,
- "=d" (*d)
- : "0" (function), "c"(0) ) ;
-
- #endif
-
- #else
+ MY_cpuidex((int *)p, (int)func, 0);
+}
- int CPUInfo[4];
+Z7_NO_INLINE
+UInt32 Z7_FASTCALL z7_x86_cpuid_GetMaxFunc(void)
+{
+ int a[4];
+ MY_cpuidex(a, 0, 0);
+ return a[0];
+}
- MY__cpuidex(CPUInfo, (int)function, 0);
+#endif // MY_CPU_AMD64
+#endif // _MSC_VER
- *a = (UInt32)CPUInfo[0];
- *b = (UInt32)CPUInfo[1];
- *c = (UInt32)CPUInfo[2];
- *d = (UInt32)CPUInfo[3];
+#if defined(NEED_CHECK_FOR_CPUID)
+#define CHECK_CPUID_IS_SUPPORTED { if (z7_x86_cpuid_GetMaxFunc() == 0) return 0; }
+#else
+#define CHECK_CPUID_IS_SUPPORTED
+#endif
+#undef NEED_CHECK_FOR_CPUID
- #endif
-}
-BoolInt x86cpuid_CheckAndRead(Cx86cpuid *p)
+static
+BoolInt x86cpuid_Func_1(UInt32 *p)
{
CHECK_CPUID_IS_SUPPORTED
- MyCPUID(0, &p->maxFunc, &p->vendor[0], &p->vendor[2], &p->vendor[1]);
- MyCPUID(1, &p->ver, &p->b, &p->c, &p->d);
+ z7_x86_cpuid(p, 1);
return True;
}
-static const UInt32 kVendors[][3] =
+/*
+static const UInt32 kVendors[][1] =
{
- { 0x756E6547, 0x49656E69, 0x6C65746E},
- { 0x68747541, 0x69746E65, 0x444D4163},
- { 0x746E6543, 0x48727561, 0x736C7561}
+ { 0x756E6547 }, // , 0x49656E69, 0x6C65746E },
+ { 0x68747541 }, // , 0x69746E65, 0x444D4163 },
+ { 0x746E6543 } // , 0x48727561, 0x736C7561 }
};
+*/
+
+/*
+typedef struct
+{
+ UInt32 maxFunc;
+ UInt32 vendor[3];
+ UInt32 ver;
+ UInt32 b;
+ UInt32 c;
+ UInt32 d;
+} Cx86cpuid;
+
+enum
+{
+ CPU_FIRM_INTEL,
+ CPU_FIRM_AMD,
+ CPU_FIRM_VIA
+};
+int x86cpuid_GetFirm(const Cx86cpuid *p);
+#define x86cpuid_ver_GetFamily(ver) (((ver >> 16) & 0xff0) | ((ver >> 8) & 0xf))
+#define x86cpuid_ver_GetModel(ver) (((ver >> 12) & 0xf0) | ((ver >> 4) & 0xf))
+#define x86cpuid_ver_GetStepping(ver) (ver & 0xf)
int x86cpuid_GetFirm(const Cx86cpuid *p)
{
unsigned i;
- for (i = 0; i < sizeof(kVendors) / sizeof(kVendors[i]); i++)
+ for (i = 0; i < sizeof(kVendors) / sizeof(kVendors[0]); i++)
{
const UInt32 *v = kVendors[i];
- if (v[0] == p->vendor[0] &&
- v[1] == p->vendor[1] &&
- v[2] == p->vendor[2])
+ if (v[0] == p->vendor[0]
+ // && v[1] == p->vendor[1]
+ // && v[2] == p->vendor[2]
+ )
return (int)i;
}
return -1;
@@ -190,41 +321,55 @@ int x86cpuid_GetFirm(const Cx86cpuid *p)
BoolInt CPU_Is_InOrder()
{
Cx86cpuid p;
- int firm;
UInt32 family, model;
if (!x86cpuid_CheckAndRead(&p))
return True;
- family = x86cpuid_GetFamily(p.ver);
- model = x86cpuid_GetModel(p.ver);
-
- firm = x86cpuid_GetFirm(&p);
+ family = x86cpuid_ver_GetFamily(p.ver);
+ model = x86cpuid_ver_GetModel(p.ver);
- switch (firm)
+ switch (x86cpuid_GetFirm(&p))
{
case CPU_FIRM_INTEL: return (family < 6 || (family == 6 && (
- /* In-Order Atom CPU */
- model == 0x1C /* 45 nm, N4xx, D4xx, N5xx, D5xx, 230, 330 */
- || model == 0x26 /* 45 nm, Z6xx */
- || model == 0x27 /* 32 nm, Z2460 */
- || model == 0x35 /* 32 nm, Z2760 */
- || model == 0x36 /* 32 nm, N2xxx, D2xxx */
+ // In-Order Atom CPU
+ model == 0x1C // 45 nm, N4xx, D4xx, N5xx, D5xx, 230, 330
+ || model == 0x26 // 45 nm, Z6xx
+ || model == 0x27 // 32 nm, Z2460
+ || model == 0x35 // 32 nm, Z2760
+ || model == 0x36 // 32 nm, N2xxx, D2xxx
)));
case CPU_FIRM_AMD: return (family < 5 || (family == 5 && (model < 6 || model == 0xA)));
case CPU_FIRM_VIA: return (family < 6 || (family == 6 && model < 0xF));
}
- return True;
+ return False; // v23 : unknown processors are not In-Order
}
+*/
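
A worked example of the (commented-out) ver-decoding macros above, using a Haswell-era CPUID.1:EAX value:

    #include <stdio.h>

    #define x86cpuid_ver_GetFamily(ver)   (((ver >> 16) & 0xff0) | ((ver >> 8) & 0xf))
    #define x86cpuid_ver_GetModel(ver)    (((ver >> 12) & 0xf0) | ((ver >> 4) & 0xf))
    #define x86cpuid_ver_GetStepping(ver) (ver & 0xf)

    int main(void)
    {
      const unsigned ver = 0x000306C3;  // example CPUID.1:EAX (Core i7-4770 class)
      // family = 0x000 (extended family) | 6   = 6
      // model  = 0x30  (extended model)  | 0xC = 0x3C
      printf("family=%u model=0x%X stepping=%u\n",
          x86cpuid_ver_GetFamily(ver),     // 6
          x86cpuid_ver_GetModel(ver),      // 0x3C
          x86cpuid_ver_GetStepping(ver));  // 3
      return 0;
    }
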
+
+#ifdef _WIN32
+#include "7zWindows.h"
+#endif
#if !defined(MY_CPU_AMD64) && defined(_WIN32)
-#include <Windows.h>
-static BoolInt CPU_Sys_Is_SSE_Supported()
+
+/* for legacy SSE on ia32: there is no user-space cpu instruction to check
+   that the OS supports SSE register storing/restoring on context switches.
+   So we need some OS-specific function to check that it's safe to use SSE registers.
+*/
+
+Z7_FORCE_INLINE
+static BoolInt CPU_Sys_Is_SSE_Supported(void)
{
- OSVERSIONINFO vi;
- vi.dwOSVersionInfoSize = sizeof(vi);
- if (!GetVersionEx(&vi))
- return False;
- return (vi.dwMajorVersion >= 5);
+#ifdef _MSC_VER
+ #pragma warning(push)
+ #pragma warning(disable : 4996) // `GetVersion': was declared deprecated
+#endif
+ /* the low byte is the major version of Windows.
+    We suppose that any Windows version since
+    Windows 2000 (major == 5) supports SSE registers */
+ return (Byte)GetVersion() >= 5;
+#if defined(_MSC_VER)
+ #pragma warning(pop)
+#endif
}
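
The packing that GetVersion() uses can be decoded explicitly (a sketch with this file's types; the low word holds the major version in its low byte and the minor version in its high byte):

    DWORD v = GetVersion();
    unsigned major = (unsigned)(Byte)v;         // low byte: major version (5 == Win2000)
    unsigned minor = (unsigned)(Byte)(v >> 8);  // second byte: minor version
    BoolInt sseOsOk = (major >= 5);             // the same test as above
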
#define CHECK_SYS_SSE_SUPPORT if (!CPU_Sys_Is_SSE_Supported()) return False;
#else
@@ -232,94 +377,300 @@ static BoolInt CPU_Sys_Is_SSE_Supported()
#endif
-static UInt32 X86_CPUID_ECX_Get_Flags()
+#if !defined(MY_CPU_AMD64)
+
+BoolInt CPU_IsSupported_CMOV(void)
{
- Cx86cpuid p;
+ UInt32 a[4];
+ if (!x86cpuid_Func_1(&a[0]))
+ return 0;
+ return (a[3] >> 15) & 1;
+}
+
+BoolInt CPU_IsSupported_SSE(void)
+{
+ UInt32 a[4];
CHECK_SYS_SSE_SUPPORT
- if (!x86cpuid_CheckAndRead(&p))
+ if (!x86cpuid_Func_1(&a[0]))
+ return 0;
+ return (a[3] >> 25) & 1;
+}
+
+BoolInt CPU_IsSupported_SSE2(void)
+{
+ UInt32 a[4];
+ CHECK_SYS_SSE_SUPPORT
+ if (!x86cpuid_Func_1(&a[0]))
+ return 0;
+ return (a[3] >> 26) & 1;
+}
+
+#endif
+
+
+static UInt32 x86cpuid_Func_1_ECX(void)
+{
+ UInt32 a[4];
+ CHECK_SYS_SSE_SUPPORT
+ if (!x86cpuid_Func_1(&a[0]))
return 0;
- return p.c;
+ return a[2];
}
-BoolInt CPU_IsSupported_AES()
+BoolInt CPU_IsSupported_AES(void)
{
- return (X86_CPUID_ECX_Get_Flags() >> 25) & 1;
+ return (x86cpuid_Func_1_ECX() >> 25) & 1;
}
-BoolInt CPU_IsSupported_SSSE3()
+BoolInt CPU_IsSupported_SSSE3(void)
{
- return (X86_CPUID_ECX_Get_Flags() >> 9) & 1;
+ return (x86cpuid_Func_1_ECX() >> 9) & 1;
}
-BoolInt CPU_IsSupported_SSE41()
+BoolInt CPU_IsSupported_SSE41(void)
{
- return (X86_CPUID_ECX_Get_Flags() >> 19) & 1;
+ return (x86cpuid_Func_1_ECX() >> 19) & 1;
}
-BoolInt CPU_IsSupported_SHA()
+BoolInt CPU_IsSupported_SHA(void)
{
- Cx86cpuid p;
CHECK_SYS_SSE_SUPPORT
- if (!x86cpuid_CheckAndRead(&p))
- return False;
- if (p.maxFunc < 7)
+ if (z7_x86_cpuid_GetMaxFunc() < 7)
return False;
{
- UInt32 d[4] = { 0 };
- MyCPUID(7, &d[0], &d[1], &d[2], &d[3]);
+ UInt32 d[4];
+ z7_x86_cpuid(d, 7);
return (d[1] >> 29) & 1;
}
}
-// #include <stdio.h>
+/*
+MSVC: _xgetbv() intrinsic is available since VS2010SP1.
+  MSVC also defines the (_XCR_XFEATURE_ENABLED_MASK) macro in
+  <immintrin.h> that we can use or check.
+  For any 32-bit x86 we can use asm code in MSVC,
+  but MSVC asm code is huge after compilation.
+  So _xgetbv() is better.
+
+ICC: _xgetbv() intrinsic is available (since what version of ICC?)
+  ICC defines (__GNUC__) and it supports gnu assembler.
+  ICC also supports MASM-style code with the -use-msasm switch.
+  But ICC doesn't support __attribute__((__target__)).
+
+GCC/CLANG 9:
+  _xgetbv() is a macro that works via __builtin_ia32_xgetbv()
+  and we need __attribute__((__target__("xsave"))).
+  But with __target__("xsave") the function will not be
+  inlined into a function that has no __target__("xsave") attribute.
+  If we want _xgetbv() call inlining, then we should use the asm version
+  instead of calling _xgetbv().
+  Note: the intrinsic is broken before GCC 8.2:
+    https://gcc.gnu.org/bugzilla/show_bug.cgi?id=85684
+*/
-#ifdef _WIN32
-#include <Windows.h>
+#if defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1100) \
+ || defined(_MSC_VER) && (_MSC_VER >= 1600) && (_MSC_FULL_VER >= 160040219) \
+ || defined(__GNUC__) && (__GNUC__ >= 9) \
+ || defined(__clang__) && (__clang_major__ >= 9)
+// we define ATTRIB_XGETBV if we want to use the compiler's predefined _xgetbv()
+#if defined(__INTEL_COMPILER)
+#define ATTRIB_XGETBV
+#elif defined(__GNUC__) || defined(__clang__)
+// we don't define ATTRIB_XGETBV here, because the asm version is better for inlining.
+// #define ATTRIB_XGETBV __attribute__((__target__("xsave")))
+#else
+#define ATTRIB_XGETBV
+#endif
+#endif
+
+#if defined(ATTRIB_XGETBV)
+#include <immintrin.h>
#endif
-BoolInt CPU_IsSupported_AVX2()
+
+// XFEATURE_ENABLED_MASK/XCR0
+#define MY_XCR_XFEATURE_ENABLED_MASK 0
+
+#if defined(ATTRIB_XGETBV)
+ATTRIB_XGETBV
+#endif
+static UInt64 x86_xgetbv_0(UInt32 num)
{
- Cx86cpuid p;
- CHECK_SYS_SSE_SUPPORT
+#if defined(ATTRIB_XGETBV)
+ {
+ return
+ #if (defined(_MSC_VER))
+ _xgetbv(num);
+ #else
+ __builtin_ia32_xgetbv(
+ #if !defined(__clang__)
+ (int)
+ #endif
+ num);
+ #endif
+ }
+
+#elif defined(__GNUC__) || defined(__clang__) || defined(__SUNPRO_CC)
+
+ UInt32 a, d;
+ #if defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 4))
+ __asm__
+ (
+ "xgetbv"
+ : "=a"(a), "=d"(d) : "c"(num) : "cc"
+ );
+ #else // is old gcc
+ __asm__
+ (
+ ".byte 0x0f, 0x01, 0xd0" "\n\t"
+ : "=a"(a), "=d"(d) : "c"(num) : "cc"
+ );
+ #endif
+ return ((UInt64)d << 32) | a;
+ // return a;
+
+#elif defined(_MSC_VER) && !defined(MY_CPU_AMD64)
+
+ UInt32 a, d;
+ __asm {
+ push eax
+ push edx
+ push ecx
+ mov ecx, num;
+ // xor ecx, ecx // = MY_XCR_XFEATURE_ENABLED_MASK
+ _emit 0x0f
+ _emit 0x01
+ _emit 0xd0
+ mov a, eax
+ mov d, edx
+ pop ecx
+ pop edx
+ pop eax
+ }
+ return ((UInt64)d << 32) | a;
+ // return a;
+
+#else // unknown compiler
+  // #error "Need xgetbv function"
+  UNUSED_VAR(num)
+  // for MSVC-X64 we could call an external function from an external file.
+  /* Actually we have already checked OSXSAVE/AVX via cpuid before this point,
+     so it's expected that the OS supports at least AVX and everything below. */
+ // if (num != MY_XCR_XFEATURE_ENABLED_MASK) return 0; // if not XCR0
+ return
+ // (1 << 0) | // x87
+ (1 << 1) // SSE
+ | (1 << 2); // AVX
+
+#endif
+}
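
The XCR0 value returned above can be decoded bit by bit (bit assignments per the Intel SDM; the AVX-512 bits are shown only for context, this file uses bits 1 and 2):

    UInt64 xcr0 = x86_xgetbv_0(MY_XCR_XFEATURE_ENABLED_MASK);
    BoolInt x87 = (BoolInt)((xcr0 >> 0) & 1); // x87 state
    BoolInt sse = (BoolInt)((xcr0 >> 1) & 1); // SSE (XMM) state enabled by OS
    BoolInt avx = (BoolInt)((xcr0 >> 2) & 1); // AVX (upper YMM) state enabled by OS
    // full AVX-512 state needs bits 5..7 (opmask, ZMM_Hi256, Hi16_ZMM) all set:
    BoolInt avx512state = (BoolInt)(((xcr0 >> 5) & 7) == 7);
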
+#ifdef _WIN32
+/*
+  Windows versions do not know about new ISA extensions that
+  may be introduced later. But we can still use new extensions,
+  even if Windows doesn't report support for them,
+  as long as Windows knows about the ISA extension that changes
+  the number or size of registers: SSE, AVX/XSAVE, AVX-512.
+  So it's enough to check
+    MY_PF_AVX_INSTRUCTIONS_AVAILABLE
+  instead of
+    MY_PF_AVX2_INSTRUCTIONS_AVAILABLE
+*/
+#define MY_PF_XSAVE_ENABLED 17
+// #define MY_PF_SSSE3_INSTRUCTIONS_AVAILABLE 36
+// #define MY_PF_SSE4_1_INSTRUCTIONS_AVAILABLE 37
+// #define MY_PF_SSE4_2_INSTRUCTIONS_AVAILABLE 38
+// #define MY_PF_AVX_INSTRUCTIONS_AVAILABLE 39
+// #define MY_PF_AVX2_INSTRUCTIONS_AVAILABLE 40
+// #define MY_PF_AVX512F_INSTRUCTIONS_AVAILABLE 41
+#endif
+
+BoolInt CPU_IsSupported_AVX(void)
+{
#ifdef _WIN32
- #define MY__PF_XSAVE_ENABLED 17
- if (!IsProcessorFeaturePresent(MY__PF_XSAVE_ENABLED))
+ if (!IsProcessorFeaturePresent(MY_PF_XSAVE_ENABLED))
return False;
+ /* PF_AVX_INSTRUCTIONS_AVAILABLE is probably supported starting from
+    some late Win10 revisions. But we need AVX on older Windows too.
+    So we don't use the following check: */
+ /*
+ if (!IsProcessorFeaturePresent(MY_PF_AVX_INSTRUCTIONS_AVAILABLE))
+ return False;
+ */
#endif
- if (!x86cpuid_CheckAndRead(&p))
+ /*
+ The OS must use the new XSAVE/XRSTOR instructions to save
+ AVX registers when it is required for context switching.
+ At OS startup:
+ the OS sets the CR4.OSXSAVE flag to signal the processor that the OS supports the XSAVE extensions.
+ The OS also sets a bitmask in the XCR0 register that defines what
+ registers will be processed by the XSAVE instruction:
+   XCR0.X87[bit 0] - x87 registers and state
+   XCR0.SSE[bit 1] - SSE registers and state
+   XCR0.AVX[bit 2] - AVX registers and state
+ CR4.OSXSAVE is reflected in CPUID.1:ECX.OSXSAVE[bit 27],
+ so we can read that bit in user-space.
+ XCR0 is available for reading in user-space via the new XGETBV instruction.
+ */
+ {
+ const UInt32 c = x86cpuid_Func_1_ECX();
+ if (0 == (1
+ & (c >> 28) // AVX instructions are supported by hardware
+ & (c >> 27))) // OSXSAVE bit: XSAVE and related instructions are enabled by OS.
+ return False;
+ }
+
+ /* we could also check
+    CPUID.1:ECX.XSAVE [bit 26] : it shows that the
+    XSAVE, XRSTOR, XSETBV, XGETBV instructions are supported by hardware.
+    But that check is redundant, because if the OSXSAVE bit is set, then XSAVE is also set. */
+
+ /* If the OS has enabled the XSAVE extension instructions (OSXSAVE == 1),
+    in most cases we expect that the OS also supports storing/restoring
+    of at least the AVX and SSE states.
+    But to be sure, we call the user-space instruction
+    XGETBV(0) to get the XCR0 value, which contains a bitmask defining
+    exactly which states (registers) the OS has enabled for storing/restoring.
+ */
+
+ {
+ const UInt32 bm = (UInt32)x86_xgetbv_0(MY_XCR_XFEATURE_ENABLED_MASK);
+ // printf("\n=== XGetBV=%d\n", bm);
+ return 1
+ & (bm >> 1) // SSE state is supported (set by OS) for storing/restoring
+ & (bm >> 2); // AVX state is supported (set by OS) for storing/restoring
+ }
+ // since Win7SP1: we can use GetEnabledXStateFeatures();
+}
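
A typical consumer of these checks does one-time runtime dispatch; a sketch (the Crc32_Calc_* names are hypothetical, not from this file):

    typedef UInt32 (*Crc32Func)(const void *data, size_t size, UInt32 crc);

    UInt32 Crc32_Calc_Generic(const void *data, size_t size, UInt32 crc); // portable path
    UInt32 Crc32_Calc_Avx(const void *data, size_t size, UInt32 crc);     // AVX path

    static Crc32Func g_crc32;

    static void Crc32_SelectImpl(void)
    {
      // probe once at startup; CPU_IsSupported_AVX() also verifies OS XSAVE support
      g_crc32 = CPU_IsSupported_AVX() ? Crc32_Calc_Avx : Crc32_Calc_Generic;
    }
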
+
+
+BoolInt CPU_IsSupported_AVX2(void)
+{
+ if (!CPU_IsSupported_AVX())
return False;
- if (p.maxFunc < 7)
+ if (z7_x86_cpuid_GetMaxFunc() < 7)
return False;
{
- UInt32 d[4] = { 0 };
- MyCPUID(7, &d[0], &d[1], &d[2], &d[3]);
+ UInt32 d[4];
+ z7_x86_cpuid(d, 7);
// printf("\ncpuid(7): ebx=%8x ecx=%8x\n", d[1], d[2]);
return 1
& (d[1] >> 5); // avx2
}
}
-BoolInt CPU_IsSupported_VAES_AVX2()
+BoolInt CPU_IsSupported_VAES_AVX2(void)
{
- Cx86cpuid p;
- CHECK_SYS_SSE_SUPPORT
-
- #ifdef _WIN32
- #define MY__PF_XSAVE_ENABLED 17
- if (!IsProcessorFeaturePresent(MY__PF_XSAVE_ENABLED))
+ if (!CPU_IsSupported_AVX())
return False;
- #endif
-
- if (!x86cpuid_CheckAndRead(&p))
- return False;
- if (p.maxFunc < 7)
+ if (z7_x86_cpuid_GetMaxFunc() < 7)
return False;
{
- UInt32 d[4] = { 0 };
- MyCPUID(7, &d[0], &d[1], &d[2], &d[3]);
+ UInt32 d[4];
+ z7_x86_cpuid(d, 7);
// printf("\ncpuid(7): ebx=%8x ecx=%8x\n", d[1], d[2]);
return 1
& (d[1] >> 5) // avx2
@@ -328,20 +679,15 @@ BoolInt CPU_IsSupported_VAES_AVX2()
}
}
-BoolInt CPU_IsSupported_PageGB()
+BoolInt CPU_IsSupported_PageGB(void)
{
- Cx86cpuid cpuid;
- if (!x86cpuid_CheckAndRead(&cpuid))
- return False;
+ CHECK_CPUID_IS_SUPPORTED
{
- UInt32 d[4] = { 0 };
- MyCPUID(0x80000000, &d[0], &d[1], &d[2], &d[3]);
+ UInt32 d[4];
+ z7_x86_cpuid(d, 0x80000000);
if (d[0] < 0x80000001)
return False;
- }
- {
- UInt32 d[4] = { 0 };
- MyCPUID(0x80000001, &d[0], &d[1], &d[2], &d[3]);
+ z7_x86_cpuid(d, 0x80000001);
return (d[3] >> 26) & 1;
}
}
@@ -351,11 +697,11 @@ BoolInt CPU_IsSupported_PageGB()
#ifdef _WIN32
-#include <Windows.h>
+#include "7zWindows.h"
-BoolInt CPU_IsSupported_CRC32() { return IsProcessorFeaturePresent(PF_ARM_V8_CRC32_INSTRUCTIONS_AVAILABLE) ? 1 : 0; }
-BoolInt CPU_IsSupported_CRYPTO() { return IsProcessorFeaturePresent(PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE) ? 1 : 0; }
-BoolInt CPU_IsSupported_NEON() { return IsProcessorFeaturePresent(PF_ARM_NEON_INSTRUCTIONS_AVAILABLE) ? 1 : 0; }
+BoolInt CPU_IsSupported_CRC32(void) { return IsProcessorFeaturePresent(PF_ARM_V8_CRC32_INSTRUCTIONS_AVAILABLE) ? 1 : 0; }
+BoolInt CPU_IsSupported_CRYPTO(void) { return IsProcessorFeaturePresent(PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE) ? 1 : 0; }
+BoolInt CPU_IsSupported_NEON(void) { return IsProcessorFeaturePresent(PF_ARM_NEON_INSTRUCTIONS_AVAILABLE) ? 1 : 0; }
#else
@@ -378,28 +724,27 @@ static void Print_sysctlbyname(const char *name)
}
}
*/
+/*
+ Print_sysctlbyname("hw.pagesize");
+ Print_sysctlbyname("machdep.cpu.brand_string");
+*/
-static BoolInt My_sysctlbyname_Get_BoolInt(const char *name)
+static BoolInt z7_sysctlbyname_Get_BoolInt(const char *name)
{
UInt32 val = 0;
- if (My_sysctlbyname_Get_UInt32(name, &val) == 0 && val == 1)
+ if (z7_sysctlbyname_Get_UInt32(name, &val) == 0 && val == 1)
return 1;
return 0;
}
- /*
- Print_sysctlbyname("hw.pagesize");
- Print_sysctlbyname("machdep.cpu.brand_string");
- */
-
BoolInt CPU_IsSupported_CRC32(void)
{
- return My_sysctlbyname_Get_BoolInt("hw.optional.armv8_crc32");
+ return z7_sysctlbyname_Get_BoolInt("hw.optional.armv8_crc32");
}
BoolInt CPU_IsSupported_NEON(void)
{
- return My_sysctlbyname_Get_BoolInt("hw.optional.neon");
+ return z7_sysctlbyname_Get_BoolInt("hw.optional.neon");
}
#ifdef MY_CPU_ARM64
@@ -461,15 +806,15 @@ MY_HWCAP_CHECK_FUNC (AES)
#include <sys/sysctl.h>
-int My_sysctlbyname_Get(const char *name, void *buf, size_t *bufSize)
+int z7_sysctlbyname_Get(const char *name, void *buf, size_t *bufSize)
{
return sysctlbyname(name, buf, bufSize, NULL, 0);
}
-int My_sysctlbyname_Get_UInt32(const char *name, UInt32 *val)
+int z7_sysctlbyname_Get_UInt32(const char *name, UInt32 *val)
{
size_t bufSize = sizeof(*val);
- int res = My_sysctlbyname_Get(name, val, &bufSize);
+ const int res = z7_sysctlbyname_Get(name, val, &bufSize);
if (res == 0 && bufSize != sizeof(*val))
return EFAULT;
return res;