From 0b6c0afeb7d5eb4a44897303a4e93d6c863c486d Mon Sep 17 00:00:00 2001 From: aroulin Date: Wed, 26 Aug 2015 09:11:07 +0200 Subject: [PATCH 1/2] Common: Import BitSet from Dolphin --- src/common/CMakeLists.txt | 1 + src/common/bit_set.h | 189 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 190 insertions(+) create mode 100644 src/common/bit_set.h diff --git a/src/common/CMakeLists.txt b/src/common/CMakeLists.txt index 7f3712efa..2be6fe996 100644 --- a/src/common/CMakeLists.txt +++ b/src/common/CMakeLists.txt @@ -24,6 +24,7 @@ set(SRCS set(HEADERS assert.h bit_field.h + bit_set.h break_points.h chunk_file.h code_block.h diff --git a/src/common/bit_set.h b/src/common/bit_set.h new file mode 100644 index 000000000..85f91e786 --- /dev/null +++ b/src/common/bit_set.h @@ -0,0 +1,189 @@ +// This file is under the public domain. + +#pragma once + +#include +#ifdef _WIN32 +#include +#endif +#include +#include +#include "common/common_types.h" + +// namespace avoids conflict with OS X Carbon; don't use BitSet directly +namespace Common { + +// Helper functions: + +#ifdef _WIN32 +template +static inline int CountSetBits(T v) +{ + // from https://graphics.stanford.edu/~seander/bithacks.html + // GCC has this built in, but MSVC's intrinsic will only emit the actual + // POPCNT instruction, which we're not depending on + v = v - ((v >> 1) & (T)~(T)0/3); + v = (v & (T)~(T)0/15*3) + ((v >> 2) & (T)~(T)0/15*3); + v = (v + (v >> 4)) & (T)~(T)0/255*15; + return (T)(v * ((T)~(T)0/255)) >> (sizeof(T) - 1) * 8; +} +static inline int LeastSignificantSetBit(u8 val) +{ + unsigned long index; + _BitScanForward(&index, val); + return (int)index; +} +static inline int LeastSignificantSetBit(u16 val) +{ + unsigned long index; + _BitScanForward(&index, val); + return (int)index; +} +static inline int LeastSignificantSetBit(u32 val) +{ + unsigned long index; + _BitScanForward(&index, val); + return (int)index; +} +static inline int LeastSignificantSetBit(u64 val) +{ + unsigned long index; + _BitScanForward64(&index, val); + return (int)index; +} +#else +static inline int CountSetBits(u8 val) { return __builtin_popcount(val); } +static inline int CountSetBits(u16 val) { return __builtin_popcount(val); } +static inline int CountSetBits(u32 val) { return __builtin_popcount(val); } +static inline int CountSetBits(u64 val) { return __builtin_popcountll(val); } +static inline int LeastSignificantSetBit(u8 val) { return __builtin_ctz(val); } +static inline int LeastSignificantSetBit(u16 val) { return __builtin_ctz(val); } +static inline int LeastSignificantSetBit(u32 val) { return __builtin_ctz(val); } +static inline int LeastSignificantSetBit(u64 val) { return __builtin_ctzll(val); } +#endif + +// Similar to std::bitset, this is a class which encapsulates a bitset, i.e. +// using the set bits of an integer to represent a set of integers. Like that +// class, it acts like an array of bools: +// BitSet32 bs; +// bs[1] = true; +// but also like the underlying integer ([0] = least significant bit): +// BitSet32 bs2 = ...; +// bs = (bs ^ bs2) & BitSet32(0xffff); +// The following additional functionality is provided: +// - Construction using an initializer list. +// BitSet bs { 1, 2, 4, 8 }; +// - Efficiently iterating through the set bits: +// for (int i : bs) +// [i is the *index* of a set bit] +// (This uses the appropriate CPU instruction to find the next set bit in one +// operation.) +// - Counting set bits using .Count() - see comment on that method. + +// TODO: use constexpr when MSVC gets out of the Dark Ages + +template +class BitSet +{ + static_assert(!std::is_signed::value, "BitSet should not be used with signed types"); +public: + // A reference to a particular bit, returned from operator[]. + class Ref + { + public: + Ref(Ref&& other) : m_bs(other.m_bs), m_mask(other.m_mask) {} + Ref(BitSet* bs, IntTy mask) : m_bs(bs), m_mask(mask) {} + operator bool() const { return (m_bs->m_val & m_mask) != 0; } + bool operator=(bool set) + { + m_bs->m_val = (m_bs->m_val & ~m_mask) | (set ? m_mask : 0); + return set; + } + private: + BitSet* m_bs; + IntTy m_mask; + }; + + // A STL-like iterator is required to be able to use range-based for loops. + class Iterator + { + public: + Iterator(const Iterator& other) : m_val(other.m_val), m_bit(other.m_bit) {} + Iterator(IntTy val, int bit) : m_val(val), m_bit(bit) {} + Iterator& operator=(Iterator other) { new (this) Iterator(other); return *this; } + int operator*() { return m_bit; } + Iterator& operator++() + { + if (m_val == 0) + { + m_bit = -1; + } + else + { + int bit = LeastSignificantSetBit(m_val); + m_val &= ~(1 << bit); + m_bit = bit; + } + return *this; + } + Iterator operator++(int _) + { + Iterator other(*this); + ++*this; + return other; + } + bool operator==(Iterator other) const { return m_bit == other.m_bit; } + bool operator!=(Iterator other) const { return m_bit != other.m_bit; } + private: + IntTy m_val; + int m_bit; + }; + + BitSet() : m_val(0) {} + explicit BitSet(IntTy val) : m_val(val) {} + BitSet(std::initializer_list init) + { + m_val = 0; + for (int bit : init) + m_val |= (IntTy)1 << bit; + } + + static BitSet AllTrue(size_t count) + { + return BitSet(count == sizeof(IntTy)*8 ? ~(IntTy)0 : (((IntTy)1 << count) - 1)); + } + + Ref operator[](size_t bit) { return Ref(this, (IntTy)1 << bit); } + const Ref operator[](size_t bit) const { return (*const_cast(this))[bit]; } + bool operator==(BitSet other) const { return m_val == other.m_val; } + bool operator!=(BitSet other) const { return m_val != other.m_val; } + bool operator<(BitSet other) const { return m_val < other.m_val; } + bool operator>(BitSet other) const { return m_val > other.m_val; } + BitSet operator|(BitSet other) const { return BitSet(m_val | other.m_val); } + BitSet operator&(BitSet other) const { return BitSet(m_val & other.m_val); } + BitSet operator^(BitSet other) const { return BitSet(m_val ^ other.m_val); } + BitSet operator~() const { return BitSet(~m_val); } + BitSet& operator|=(BitSet other) { return *this = *this | other; } + BitSet& operator&=(BitSet other) { return *this = *this & other; } + BitSet& operator^=(BitSet other) { return *this = *this ^ other; } + operator u32() = delete; + operator bool() { return m_val != 0; } + + // Warning: Even though on modern CPUs this is a single fast instruction, + // Dolphin's official builds do not currently assume POPCNT support on x86, + // so slower explicit bit twiddling is generated. Still should generally + // be faster than a loop. + unsigned int Count() const { return CountSetBits(m_val); } + + Iterator begin() const { Iterator it(m_val, 0); return ++it; } + Iterator end() const { return Iterator(m_val, -1); } + + IntTy m_val; +}; + +} // Common + +typedef Common::BitSet BitSet8; +typedef Common::BitSet BitSet16; +typedef Common::BitSet BitSet32; +typedef Common::BitSet BitSet64; \ No newline at end of file From 179ad35c2e6dff0c367dedb63c47a78c6cd052a5 Mon Sep 17 00:00:00 2001 From: aroulin Date: Wed, 26 Aug 2015 09:12:14 +0200 Subject: [PATCH 2/2] x64: Proper stack alignment in shader JIT function calls Import Dolphin stack handling and register saving routines Also removes the x86 parts from abi files --- src/common/x64/abi.cpp | 419 +++-------------------- src/common/x64/abi.h | 61 ++-- src/common/x64/emitter.h | 40 ++- src/video_core/shader/shader_jit_x64.cpp | 43 +-- src/video_core/shader/shader_jit_x64.h | 3 +- 5 files changed, 111 insertions(+), 455 deletions(-) diff --git a/src/common/x64/abi.cpp b/src/common/x64/abi.cpp index 4c07a6ebe..955eb86ce 100644 --- a/src/common/x64/abi.cpp +++ b/src/common/x64/abi.cpp @@ -22,247 +22,69 @@ using namespace Gen; // Shared code between Win64 and Unix64 -// Sets up a __cdecl function. -void XEmitter::ABI_EmitPrologue(int maxCallParams) -{ -#ifdef _M_IX86 - // Don't really need to do anything -#elif defined(ARCHITECTURE_x86_64) -#if _WIN32 - int stacksize = ((maxCallParams + 1) & ~1) * 8 + 8; - // Set up a stack frame so that we can call functions - // TODO: use maxCallParams - SUB(64, R(RSP), Imm8(stacksize)); -#endif -#else -#error Arch not supported +void XEmitter::ABI_CalculateFrameSize(BitSet32 mask, size_t rsp_alignment, size_t needed_frame_size, size_t* shadowp, size_t* subtractionp, size_t* xmm_offsetp) { + size_t shadow = 0; +#if defined(_WIN32) + shadow = 0x20; #endif + + int count = (mask & ABI_ALL_GPRS).Count(); + rsp_alignment -= count * 8; + size_t subtraction = 0; + int fpr_count = (mask & ABI_ALL_FPRS).Count(); + if (fpr_count) { + // If we have any XMMs to save, we must align the stack here. + subtraction = rsp_alignment & 0xf; + } + subtraction += 16 * fpr_count; + size_t xmm_base_subtraction = subtraction; + subtraction += needed_frame_size; + subtraction += shadow; + // Final alignment. + rsp_alignment -= subtraction; + subtraction += rsp_alignment & 0xf; + + *shadowp = shadow; + *subtractionp = subtraction; + *xmm_offsetp = subtraction - xmm_base_subtraction; } -void XEmitter::ABI_EmitEpilogue(int maxCallParams) -{ -#ifdef _M_IX86 - RET(); -#elif defined(ARCHITECTURE_x86_64) -#ifdef _WIN32 - int stacksize = ((maxCallParams+1)&~1)*8 + 8; - ADD(64, R(RSP), Imm8(stacksize)); -#endif - RET(); -#else -#error Arch not supported +size_t XEmitter::ABI_PushRegistersAndAdjustStack(BitSet32 mask, size_t rsp_alignment, size_t needed_frame_size) { + size_t shadow, subtraction, xmm_offset; + ABI_CalculateFrameSize(mask, rsp_alignment, needed_frame_size, &shadow, &subtraction, &xmm_offset); + for (int r : mask & ABI_ALL_GPRS) + PUSH((X64Reg)r); -#endif + if (subtraction) + SUB(64, R(RSP), subtraction >= 0x80 ? Imm32((u32)subtraction) : Imm8((u8)subtraction)); + + for (int x : mask & ABI_ALL_FPRS) { + MOVAPD(MDisp(RSP, (int)xmm_offset), (X64Reg)(x - 16)); + xmm_offset += 16; + } + + return shadow; } -#ifdef _M_IX86 // All32 +void XEmitter::ABI_PopRegistersAndAdjustStack(BitSet32 mask, size_t rsp_alignment, size_t needed_frame_size) { + size_t shadow, subtraction, xmm_offset; + ABI_CalculateFrameSize(mask, rsp_alignment, needed_frame_size, &shadow, &subtraction, &xmm_offset); -// Shared code between Win32 and Unix32 -void XEmitter::ABI_CallFunction(const void *func) { - ABI_AlignStack(0); - CALL(func); - ABI_RestoreStack(0); -} + for (int x : mask & ABI_ALL_FPRS) { + MOVAPD((X64Reg) (x - 16), MDisp(RSP, (int)xmm_offset)); + xmm_offset += 16; + } -void XEmitter::ABI_CallFunctionC16(const void *func, u16 param1) { - ABI_AlignStack(1 * 2); - PUSH(16, Imm16(param1)); - CALL(func); - ABI_RestoreStack(1 * 2); -} + if (subtraction) + ADD(64, R(RSP), subtraction >= 0x80 ? Imm32((u32)subtraction) : Imm8((u8)subtraction)); -void XEmitter::ABI_CallFunctionCC16(const void *func, u32 param1, u16 param2) { - ABI_AlignStack(1 * 2 + 1 * 4); - PUSH(16, Imm16(param2)); - PUSH(32, Imm32(param1)); - CALL(func); - ABI_RestoreStack(1 * 2 + 1 * 4); -} - -void XEmitter::ABI_CallFunctionC(const void *func, u32 param1) { - ABI_AlignStack(1 * 4); - PUSH(32, Imm32(param1)); - CALL(func); - ABI_RestoreStack(1 * 4); -} - -void XEmitter::ABI_CallFunctionCC(const void *func, u32 param1, u32 param2) { - ABI_AlignStack(2 * 4); - PUSH(32, Imm32(param2)); - PUSH(32, Imm32(param1)); - CALL(func); - ABI_RestoreStack(2 * 4); -} - -void XEmitter::ABI_CallFunctionCCC(const void *func, u32 param1, u32 param2, u32 param3) { - ABI_AlignStack(3 * 4); - PUSH(32, Imm32(param3)); - PUSH(32, Imm32(param2)); - PUSH(32, Imm32(param1)); - CALL(func); - ABI_RestoreStack(3 * 4); -} - -void XEmitter::ABI_CallFunctionCCP(const void *func, u32 param1, u32 param2, void *param3) { - ABI_AlignStack(3 * 4); - PUSH(32, ImmPtr(param3)); - PUSH(32, Imm32(param2)); - PUSH(32, Imm32(param1)); - CALL(func); - ABI_RestoreStack(3 * 4); -} - -void XEmitter::ABI_CallFunctionCCCP(const void *func, u32 param1, u32 param2,u32 param3, void *param4) { - ABI_AlignStack(4 * 4); - PUSH(32, ImmPtr(param4)); - PUSH(32, Imm32(param3)); - PUSH(32, Imm32(param2)); - PUSH(32, Imm32(param1)); - CALL(func); - ABI_RestoreStack(4 * 4); -} - -void XEmitter::ABI_CallFunctionP(const void *func, void *param1) { - ABI_AlignStack(1 * 4); - PUSH(32, ImmPtr(param1)); - CALL(func); - ABI_RestoreStack(1 * 4); -} - -void XEmitter::ABI_CallFunctionPA(const void *func, void *param1, const Gen::OpArg &arg2) { - ABI_AlignStack(2 * 4); - PUSH(32, arg2); - PUSH(32, ImmPtr(param1)); - CALL(func); - ABI_RestoreStack(2 * 4); -} - -void XEmitter::ABI_CallFunctionPAA(const void *func, void *param1, const Gen::OpArg &arg2, const Gen::OpArg &arg3) { - ABI_AlignStack(3 * 4); - PUSH(32, arg3); - PUSH(32, arg2); - PUSH(32, ImmPtr(param1)); - CALL(func); - ABI_RestoreStack(3 * 4); -} - -void XEmitter::ABI_CallFunctionPPC(const void *func, void *param1, void *param2, u32 param3) { - ABI_AlignStack(3 * 4); - PUSH(32, Imm32(param3)); - PUSH(32, ImmPtr(param2)); - PUSH(32, ImmPtr(param1)); - CALL(func); - ABI_RestoreStack(3 * 4); -} - -// Pass a register as a parameter. -void XEmitter::ABI_CallFunctionR(const void *func, X64Reg reg1) { - ABI_AlignStack(1 * 4); - PUSH(32, R(reg1)); - CALL(func); - ABI_RestoreStack(1 * 4); -} - -// Pass two registers as parameters. -void XEmitter::ABI_CallFunctionRR(const void *func, Gen::X64Reg reg1, Gen::X64Reg reg2) -{ - ABI_AlignStack(2 * 4); - PUSH(32, R(reg2)); - PUSH(32, R(reg1)); - CALL(func); - ABI_RestoreStack(2 * 4); -} - -void XEmitter::ABI_CallFunctionAC(const void *func, const Gen::OpArg &arg1, u32 param2) -{ - ABI_AlignStack(2 * 4); - PUSH(32, Imm32(param2)); - PUSH(32, arg1); - CALL(func); - ABI_RestoreStack(2 * 4); -} - -void XEmitter::ABI_CallFunctionACC(const void *func, const Gen::OpArg &arg1, u32 param2, u32 param3) -{ - ABI_AlignStack(3 * 4); - PUSH(32, Imm32(param3)); - PUSH(32, Imm32(param2)); - PUSH(32, arg1); - CALL(func); - ABI_RestoreStack(3 * 4); -} - -void XEmitter::ABI_CallFunctionA(const void *func, const Gen::OpArg &arg1) -{ - ABI_AlignStack(1 * 4); - PUSH(32, arg1); - CALL(func); - ABI_RestoreStack(1 * 4); -} - -void XEmitter::ABI_CallFunctionAA(const void *func, const Gen::OpArg &arg1, const Gen::OpArg &arg2) -{ - ABI_AlignStack(2 * 4); - PUSH(32, arg2); - PUSH(32, arg1); - CALL(func); - ABI_RestoreStack(2 * 4); -} - -void XEmitter::ABI_PushAllCalleeSavedRegsAndAdjustStack() { - // Note: 4 * 4 = 16 bytes, so alignment is preserved. - PUSH(EBP); - PUSH(EBX); - PUSH(ESI); - PUSH(EDI); -} - -void XEmitter::ABI_PopAllCalleeSavedRegsAndAdjustStack() { - POP(EDI); - POP(ESI); - POP(EBX); - POP(EBP); -} - -unsigned int XEmitter::ABI_GetAlignedFrameSize(unsigned int frameSize) { - frameSize += 4; // reserve space for return address - unsigned int alignedSize = -#ifdef __GNUC__ - (frameSize + 15) & -16; -#else - (frameSize + 3) & -4; -#endif - return alignedSize; -} - - -void XEmitter::ABI_AlignStack(unsigned int frameSize) { -// Mac OS X requires the stack to be 16-byte aligned before every call. -// Linux requires the stack to be 16-byte aligned before calls that put SSE -// vectors on the stack, but since we do not keep track of which calls do that, -// it is effectively every call as well. -// Windows binaries compiled with MSVC do not have such a restriction*, but I -// expect that GCC on Windows acts the same as GCC on Linux in this respect. -// It would be nice if someone could verify this. -// *However, the MSVC optimizing compiler assumes a 4-byte-aligned stack at times. - unsigned int fillSize = - ABI_GetAlignedFrameSize(frameSize) - (frameSize + 4); - if (fillSize != 0) { - SUB(32, R(ESP), Imm8(fillSize)); + for (int r = 15; r >= 0; r--) { + if (mask[r]) + POP((X64Reg)r); } } -void XEmitter::ABI_RestoreStack(unsigned int frameSize) { - unsigned int alignedSize = ABI_GetAlignedFrameSize(frameSize); - alignedSize -= 4; // return address is POPped at end of call - if (alignedSize != 0) { - ADD(32, R(ESP), Imm8(alignedSize)); - } -} - -#else //64bit - // Common functions void XEmitter::ABI_CallFunction(const void *func) { u64 distance = u64(func) - (u64(code) + 5); @@ -538,143 +360,4 @@ void XEmitter::ABI_CallFunctionAA(const void *func, const Gen::OpArg &arg1, cons } else { CALL(func); } -} - -unsigned int XEmitter::ABI_GetAlignedFrameSize(unsigned int frameSize) { - return frameSize; -} - -#ifdef _WIN32 - -// The Windows x64 ABI requires XMM6 - XMM15 to be callee saved. 10 regs. -// But, not saving XMM4 and XMM5 breaks things in VS 2010, even though they are volatile regs. -// Let's just save all 16. -const int XMM_STACK_SPACE = 16 * 16; - -// Win64 Specific Code -void XEmitter::ABI_PushAllCalleeSavedRegsAndAdjustStack() { - //we only want to do this once - PUSH(RBX); - PUSH(RSI); - PUSH(RDI); - PUSH(RBP); - PUSH(R12); - PUSH(R13); - PUSH(R14); - PUSH(R15); - ABI_AlignStack(0); - - // Do this after aligning, because before it's offset by 8. - SUB(64, R(RSP), Imm32(XMM_STACK_SPACE)); - for (int i = 0; i < 16; ++i) - MOVAPS(MDisp(RSP, i * 16), (X64Reg)(XMM0 + i)); -} - -void XEmitter::ABI_PopAllCalleeSavedRegsAndAdjustStack() { - for (int i = 0; i < 16; ++i) - MOVAPS((X64Reg)(XMM0 + i), MDisp(RSP, i * 16)); - ADD(64, R(RSP), Imm32(XMM_STACK_SPACE)); - - ABI_RestoreStack(0); - POP(R15); - POP(R14); - POP(R13); - POP(R12); - POP(RBP); - POP(RDI); - POP(RSI); - POP(RBX); -} - -// Win64 Specific Code -void XEmitter::ABI_PushAllCallerSavedRegsAndAdjustStack() { - PUSH(RCX); - PUSH(RDX); - PUSH(RSI); - PUSH(RDI); - PUSH(R8); - PUSH(R9); - PUSH(R10); - PUSH(R11); - // TODO: Callers preserve XMM4-5 (XMM0-3 are args.) - ABI_AlignStack(0); -} - -void XEmitter::ABI_PopAllCallerSavedRegsAndAdjustStack() { - ABI_RestoreStack(0); - POP(R11); - POP(R10); - POP(R9); - POP(R8); - POP(RDI); - POP(RSI); - POP(RDX); - POP(RCX); -} - -void XEmitter::ABI_AlignStack(unsigned int /*frameSize*/) { - SUB(64, R(RSP), Imm8(0x28)); -} - -void XEmitter::ABI_RestoreStack(unsigned int /*frameSize*/) { - ADD(64, R(RSP), Imm8(0x28)); -} - -#else -// Unix64 Specific Code -void XEmitter::ABI_PushAllCalleeSavedRegsAndAdjustStack() { - PUSH(RBX); - PUSH(RBP); - PUSH(R12); - PUSH(R13); - PUSH(R14); - PUSH(R15); - PUSH(R15); //just to align stack. duped push/pop doesn't hurt. - // TODO: XMM? -} - -void XEmitter::ABI_PopAllCalleeSavedRegsAndAdjustStack() { - POP(R15); - POP(R15); - POP(R14); - POP(R13); - POP(R12); - POP(RBP); - POP(RBX); -} - -void XEmitter::ABI_PushAllCallerSavedRegsAndAdjustStack() { - PUSH(RCX); - PUSH(RDX); - PUSH(RSI); - PUSH(RDI); - PUSH(R8); - PUSH(R9); - PUSH(R10); - PUSH(R11); - PUSH(R11); -} - -void XEmitter::ABI_PopAllCallerSavedRegsAndAdjustStack() { - POP(R11); - POP(R11); - POP(R10); - POP(R9); - POP(R8); - POP(RDI); - POP(RSI); - POP(RDX); - POP(RCX); -} - -void XEmitter::ABI_AlignStack(unsigned int /*frameSize*/) { - SUB(64, R(RSP), Imm8(0x08)); -} - -void XEmitter::ABI_RestoreStack(unsigned int /*frameSize*/) { - ADD(64, R(RSP), Imm8(0x08)); -} - -#endif // WIN32 - -#endif // 32bit +} \ No newline at end of file diff --git a/src/common/x64/abi.h b/src/common/x64/abi.h index 7e9c156ae..de6d62fdd 100644 --- a/src/common/x64/abi.h +++ b/src/common/x64/abi.h @@ -1,35 +1,15 @@ -// Copyright (C) 2003 Dolphin Project. - -// This program is free software: you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation, version 2.0 or later versions. - -// This program is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License 2.0 for more details. - -// A copy of the GPL 2.0 should have been included with the program. -// If not, see http://www.gnu.org/licenses/ - -// Official SVN repository and contact information can be found at -// http://code.google.com/p/dolphin-emu/ +// Copyright 2008 Dolphin Emulator Project +// Licensed under GPLv2+ +// Refer to the license.txt file included. #pragma once -#include "common/common_types.h" +#include "common/bit_set.h" +#include "emitter.h" -// x86/x64 ABI:s, and helpers to help follow them when JIT-ing code. +// x64 ABI:s, and helpers to help follow them when JIT-ing code. // All convensions return values in EAX (+ possibly EDX). -// Linux 32-bit, Windows 32-bit (cdecl, System V): -// * Caller pushes left to right -// * Caller fixes stack after call -// * function subtract from stack for local storage only. -// Scratch: EAX ECX EDX -// Callee-save: EBX ESI EDI EBP -// Parameters: - - // Windows 64-bit // * 4-reg "fastcall" variant, very new-skool stack handling // * Callee moves stack pointer, to make room for shadow regs for the biggest function _it itself calls_ @@ -44,18 +24,8 @@ // Callee-save: RBX RBP R12 R13 R14 R15 // Parameters: RDI RSI RDX RCX R8 R9 -#ifdef _M_IX86 // 32 bit calling convention, shared by all - -// 32-bit don't pass parameters in regs, but these are convenient to have anyway when we have to -// choose regs to put stuff in. -#define ABI_PARAM1 RCX -#define ABI_PARAM2 RDX - -// There are no ABI_PARAM* here, since args are pushed. -// 32-bit bog standard cdecl, shared between linux and windows -// MacOSX 32-bit is same as System V with a few exceptions that we probably don't care much about. - -#elif ARCHITECTURE_x86_64 // 64 bit calling convention +#define ABI_ALL_FPRS BitSet32(0xffff0000) +#define ABI_ALL_GPRS BitSet32(0x0000ffff) #ifdef _WIN32 // 64-bit Windows - the really exotic calling convention @@ -64,7 +34,11 @@ #define ABI_PARAM3 R8 #define ABI_PARAM4 R9 -#else //64-bit Unix (hopefully MacOSX too) +// xmm0-xmm15 use the upper 16 bits in the functions that push/pop registers. +#define ABI_ALL_CALLER_SAVED \ + (BitSet32 { RAX, RCX, RDX, R8, R9, R10, R11, \ + XMM0+16, XMM1+16, XMM2+16, XMM3+16, XMM4+16, XMM5+16 }) +#else //64-bit Unix / OS X #define ABI_PARAM1 RDI #define ABI_PARAM2 RSI @@ -73,6 +47,13 @@ #define ABI_PARAM5 R8 #define ABI_PARAM6 R9 +// TODO: Avoid pushing all 16 XMM registers when possible. Most functions we call probably +// don't actually clobber them. +#define ABI_ALL_CALLER_SAVED \ + (BitSet32 { RAX, RCX, RDX, RDI, RSI, R8, R9, R10, R11 } | \ + ABI_ALL_FPRS) #endif // WIN32 -#endif // X86 +#define ABI_ALL_CALLEE_SAVED (~ABI_ALL_CALLER_SAVED) + +#define ABI_RETURN RAX diff --git a/src/common/x64/emitter.h b/src/common/x64/emitter.h index a49cd2cf1..2dd0dc94e 100644 --- a/src/common/x64/emitter.h +++ b/src/common/x64/emitter.h @@ -18,6 +18,7 @@ #pragma once #include "common/assert.h" +#include "common/bit_set.h" #include "common/common_types.h" #include "common/code_block.h" @@ -356,7 +357,7 @@ private: void WriteFloatLoadStore(int bits, FloatOp op, FloatOp op_80b, const OpArg& arg); void WriteNormalOp(XEmitter *emit, int bits, NormalOp op, const OpArg& a1, const OpArg& a2); - void ABI_CalculateFrameSize(u32 mask, size_t rsp_alignment, size_t needed_frame_size, size_t* shadowp, size_t* subtractionp, size_t* xmm_offsetp); + void ABI_CalculateFrameSize(BitSet32 mask, size_t rsp_alignment, size_t needed_frame_size, size_t* shadowp, size_t* subtractionp, size_t* xmm_offsetp); protected: void Write8(u8 value); @@ -1007,25 +1008,26 @@ public: ABI_CallFunctionC((const void*)func, param1); } - // A function that doesn't have any control over what it will do to regs, - // such as the dispatcher, should be surrounded by these. - void ABI_PushAllCalleeSavedRegsAndAdjustStack(); - void ABI_PopAllCalleeSavedRegsAndAdjustStack(); + /** + * Saves specified registers and adjusts the stack to be 16-byte aligned as required by the ABI + * + * @param mask Registers to push on the stack (high 16 bits are XMMs, low 16 bits are GPRs) + * @param rsp_alignment Current alignment of the stack pointer, must be 0 or 8 + * @param needed_frame_size Additional space needed, e.g., for function arguments passed on the stack + * @return Size of the shadow space, i.e., offset of the frame + */ + size_t ABI_PushRegistersAndAdjustStack(BitSet32 mask, size_t rsp_alignment, size_t needed_frame_size = 0); - // A function that doesn't know anything about it's surroundings, should - // be surrounded by these to establish a safe environment, where it can roam free. - // An example is a backpatch injected function. - void ABI_PushAllCallerSavedRegsAndAdjustStack(); - void ABI_PopAllCallerSavedRegsAndAdjustStack(); - - unsigned int ABI_GetAlignedFrameSize(unsigned int frameSize); - void ABI_AlignStack(unsigned int frameSize); - void ABI_RestoreStack(unsigned int frameSize); - - // Sets up a __cdecl function. - // Only x64 really needs the parameter count. - void ABI_EmitPrologue(int maxCallParams); - void ABI_EmitEpilogue(int maxCallParams); + /** + * Restores specified registers and adjusts the stack to its original alignment, i.e., the alignment before + * the matching PushRegistersAndAdjustStack. + * + * @param mask Registers to restores from the stack (high 16 bits are XMMs, low 16 bits are GPRs) + * @param rsp_alignment Original alignment before the matching PushRegistersAndAdjustStack, must be 0 or 8 + * @param needed_frame_size Additional space that was needed + * @warning Stack must be currently 16-byte aligned + */ + void ABI_PopRegistersAndAdjustStack(BitSet32 mask, size_t rsp_alignment, size_t needed_frame_size = 0); #ifdef _M_IX86 static int ABI_GetNumXMMRegs() { return 8; } diff --git a/src/video_core/shader/shader_jit_x64.cpp b/src/video_core/shader/shader_jit_x64.cpp index c7b63a9b7..d6011832c 100644 --- a/src/video_core/shader/shader_jit_x64.cpp +++ b/src/video_core/shader/shader_jit_x64.cpp @@ -122,6 +122,14 @@ static const X64Reg ONE = XMM14; /// Constant vector of [-0.f, -0.f, -0.f, -0.f], used to efficiently negate a vector with XOR static const X64Reg NEGBIT = XMM15; +// State registers that must not be modified by external functions calls +// Scratch registers, e.g., SRC1 and SCRATCH, have to be saved on the side if needed +static const BitSet32 persistent_regs = { + UNIFORMS, REGISTERS, // Pointers to register blocks + ADDROFFS_REG_0, ADDROFFS_REG_1, LOOPCOUNT_REG, COND0, COND1, // Cached registers + ONE+16, NEGBIT+16, // Constants +}; + /// Raw constant for the source register selector that indicates no swizzling is performed static const u8 NO_SRC_REG_SWIZZLE = 0x1b; /// Raw constant for the destination register enable mask that indicates all components are enabled @@ -295,20 +303,8 @@ void JitCompiler::Compile_UniformCondition(Instruction instr) { CMP(sizeof(bool) * 8, MDisp(UNIFORMS, offset), Imm8(0)); } -void JitCompiler::Compile_PushCallerSavedXMM() { -#ifndef _WIN32 - SUB(64, R(RSP), Imm8(2 * 16)); - MOVUPS(MDisp(RSP, 16), ONE); - MOVUPS(MDisp(RSP, 0), NEGBIT); -#endif -} - -void JitCompiler::Compile_PopCallerSavedXMM() { -#ifndef _WIN32 - MOVUPS(NEGBIT, MDisp(RSP, 0)); - MOVUPS(ONE, MDisp(RSP, 16)); - ADD(64, R(RSP), Imm8(2 * 16)); -#endif +BitSet32 JitCompiler::PersistentCallerSavedRegs() { + return persistent_regs & ABI_ALL_CALLER_SAVED; } void JitCompiler::Compile_ADD(Instruction instr) { @@ -390,12 +386,9 @@ void JitCompiler::Compile_EX2(Instruction instr) { Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); MOVSS(XMM0, R(SRC1)); - // The following will actually break the stack alignment - ABI_PushAllCallerSavedRegsAndAdjustStack(); - Compile_PushCallerSavedXMM(); + ABI_PushRegistersAndAdjustStack(PersistentCallerSavedRegs(), 0); ABI_CallFunction(reinterpret_cast(exp2f)); - Compile_PopCallerSavedXMM(); - ABI_PopAllCallerSavedRegsAndAdjustStack(); + ABI_PopRegistersAndAdjustStack(PersistentCallerSavedRegs(), 0); SHUFPS(XMM0, R(XMM0), _MM_SHUFFLE(0, 0, 0, 0)); MOVAPS(SRC1, R(XMM0)); @@ -406,12 +399,9 @@ void JitCompiler::Compile_LG2(Instruction instr) { Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); MOVSS(XMM0, R(SRC1)); - // The following will actually break the stack alignment - ABI_PushAllCallerSavedRegsAndAdjustStack(); - Compile_PushCallerSavedXMM(); + ABI_PushRegistersAndAdjustStack(PersistentCallerSavedRegs(), 0); ABI_CallFunction(reinterpret_cast(log2f)); - Compile_PopCallerSavedXMM(); - ABI_PopAllCallerSavedRegsAndAdjustStack(); + ABI_PopRegistersAndAdjustStack(PersistentCallerSavedRegs(), 0); SHUFPS(XMM0, R(XMM0), _MM_SHUFFLE(0, 0, 0, 0)); MOVAPS(SRC1, R(XMM0)); @@ -560,7 +550,7 @@ void JitCompiler::Compile_NOP(Instruction instr) { } void JitCompiler::Compile_END(Instruction instr) { - ABI_PopAllCalleeSavedRegsAndAdjustStack(); + ABI_PopRegistersAndAdjustStack(ABI_ALL_CALLEE_SAVED, 8); RET(); } @@ -755,7 +745,8 @@ CompiledShader* JitCompiler::Compile() { const auto& code = g_state.vs.program_code; unsigned offset = g_state.regs.vs.main_offset; - ABI_PushAllCalleeSavedRegsAndAdjustStack(); + // The stack pointer is 8 modulo 16 at the entry of a procedure + ABI_PushRegistersAndAdjustStack(ABI_ALL_CALLEE_SAVED, 8); MOV(PTRBITS, R(REGISTERS), R(ABI_PARAM1)); MOV(PTRBITS, R(UNIFORMS), ImmPtr(&g_state.vs.uniforms)); diff --git a/src/video_core/shader/shader_jit_x64.h b/src/video_core/shader/shader_jit_x64.h index 58828ecc8..8668cfff4 100644 --- a/src/video_core/shader/shader_jit_x64.h +++ b/src/video_core/shader/shader_jit_x64.h @@ -77,8 +77,7 @@ private: void Compile_EvaluateCondition(Instruction instr); void Compile_UniformCondition(Instruction instr); - void Compile_PushCallerSavedXMM(); - void Compile_PopCallerSavedXMM(); + BitSet32 PersistentCallerSavedRegs(); /// Pointer to the variable that stores the current Pica code offset. Used to handle nested code blocks. unsigned* offset_ptr = nullptr;