From 0a011dc368704c621ea6420e694f515af7970bf3 Mon Sep 17 00:00:00 2001 From: pineappleEA Date: Fri, 2 Apr 2021 01:44:36 +0200 Subject: [PATCH] early-access version 1546 --- README.md | 2 +- externals/dynarmic/src/CMakeLists.txt | 1 + .../src/backend/x64/block_of_code.cpp | 21 ++- .../x64/emit_x64_vector_floating_point.cpp | 30 +++- .../src/frontend/A32/decoder/thumb32.inc | 16 +- .../impl/thumb32_load_store_dual.cpp | 146 ++++++++++++++++++ .../A32/translate/impl/translate_thumb.h | 10 ++ 7 files changed, 209 insertions(+), 17 deletions(-) create mode 100755 externals/dynarmic/src/frontend/A32/translate/impl/thumb32_load_store_dual.cpp diff --git a/README.md b/README.md index d6a60764c..8e5c29aae 100755 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ yuzu emulator early access ============= -This is the source code for early-access 1543. +This is the source code for early-access 1546. ## Legal Notice diff --git a/externals/dynarmic/src/CMakeLists.txt b/externals/dynarmic/src/CMakeLists.txt index 949e50ac8..6f478101f 100755 --- a/externals/dynarmic/src/CMakeLists.txt +++ b/externals/dynarmic/src/CMakeLists.txt @@ -163,6 +163,7 @@ if ("A32" IN_LIST DYNARMIC_FRONTENDS) frontend/A32/translate/impl/thumb32_data_processing_shifted_register.cpp frontend/A32/translate/impl/thumb32_load_byte.cpp frontend/A32/translate/impl/thumb32_load_halfword.cpp + frontend/A32/translate/impl/thumb32_load_store_dual.cpp frontend/A32/translate/impl/thumb32_load_store_multiple.cpp frontend/A32/translate/impl/thumb32_load_word.cpp frontend/A32/translate/impl/thumb32_long_multiply.cpp diff --git a/externals/dynarmic/src/backend/x64/block_of_code.cpp b/externals/dynarmic/src/backend/x64/block_of_code.cpp index 4d15eb73b..752ff7ccf 100755 --- a/externals/dynarmic/src/backend/x64/block_of_code.cpp +++ b/externals/dynarmic/src/backend/x64/block_of_code.cpp @@ -13,6 +13,7 @@ #include "backend/x64/block_of_code.h" #include "backend/x64/perf_map.h" #include "common/assert.h" +#include 
"common/bit_util.h"
 
 #ifdef _WIN32
     #include <windows.h>
@@ -43,8 +44,8 @@ const std::array<Xbyak::Reg64, 6> BlockOfCode::ABI_PARAMS = {BlockOfCode::ABI_PA
 
 namespace {
 
-constexpr size_t TOTAL_CODE_SIZE = 128 * 1024 * 1024;
-constexpr size_t FAR_CODE_OFFSET = 100 * 1024 * 1024;
+constexpr size_t TOTAL_CODE_SIZE = 256 * 1024 * 1024;
+constexpr size_t FAR_CODE_OFFSET = 200 * 1024 * 1024;
 constexpr size_t CONSTANT_POOL_SIZE = 2 * 1024 * 1024;
 
 class CustomXbyakAllocator : public Xbyak::Allocator {
@@ -364,7 +365,21 @@ bool BlockOfCode::HasBMI2() const {
 }
 
 bool BlockOfCode::HasFastBMI2() const {
-    return DoesCpuSupport(Xbyak::util::Cpu::tBMI2) && !DoesCpuSupport(Xbyak::util::Cpu::tAMD);
+    if (DoesCpuSupport(Xbyak::util::Cpu::tBMI2)) {
+        // BMI2 instructions such as pdep and pext have been very slow up until Zen 3.
+        // Check for Zen 3 or newer by its display family (0x19), read from CPUID leaf 1 EAX.
+        // See also: https://en.wikichip.org/wiki/amd/cpuid
+        if (DoesCpuSupport(Xbyak::util::Cpu::tAMD)) {
+            std::array<u32, 4> data{};
+            cpu_info.getCpuid(1, data.data());
+            const u32 family_base = Common::Bits< 8, 11>(data[0]);
+            const u32 family_extended = Common::Bits<20, 27>(data[0]);
+            // The extended family field only contributes when the base family is 0xF.
+            return (family_base == 0xF ? family_base + family_extended : family_base) >= 0x19;
+        }
+        return true;
+    }
+    return false;
 }
 
 bool BlockOfCode::HasFMA() const {
diff --git a/externals/dynarmic/src/backend/x64/emit_x64_vector_floating_point.cpp b/externals/dynarmic/src/backend/x64/emit_x64_vector_floating_point.cpp
index 9662f825a..a40bb79e0 100755
--- a/externals/dynarmic/src/backend/x64/emit_x64_vector_floating_point.cpp
+++ b/externals/dynarmic/src/backend/x64/emit_x64_vector_floating_point.cpp
@@ -337,8 +337,12 @@ void EmitTwoOpVectorOperation(BlockOfCode& code, EmitContext& ctx, IR::Inst* ins
     ctx.reg_alloc.DefineValue(inst, result);
 }
 
+enum class CheckInputNaN {
+    Yes, No,
+};
+
 template<size_t fsize, template<typename> class Indexer, typename Function>
-void EmitThreeOpVectorOperation(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Function fn, typename NaNHandler<fsize, Indexer, 3>::function_type nan_handler = NaNHandler<fsize, Indexer, 3>::GetDefault()) {
+void EmitThreeOpVectorOperation(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Function fn, CheckInputNaN check_input_nan = CheckInputNaN::No, typename NaNHandler<fsize, Indexer, 3>::function_type nan_handler = NaNHandler<fsize, Indexer, 3>::GetDefault()) {
     static_assert(fsize == 32 || fsize == 64, "fsize must be either 32 or 64");
 
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
@@ -371,15 +375,31 @@ void EmitThreeOpVectorOperation(BlockOfCode& code, EmitContext& ctx, IR::Inst* i
     const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]);
     const Xbyak::Xmm nan_mask = ctx.reg_alloc.ScratchXmm();
 
-    code.movaps(nan_mask, xmm_b);
     code.movaps(result, xmm_a);
-    FCODE(cmpunordp)(nan_mask, xmm_a);
+
+    if (check_input_nan == CheckInputNaN::Yes) {
+        if (code.HasAVX()) {
+            FCODE(vcmpunordp)(nan_mask, xmm_a, xmm_b);
+        } else {
+            code.movaps(nan_mask, xmm_b);
+            FCODE(cmpunordp)(nan_mask, xmm_a);
+        }
+    }
+
     if constexpr (std::is_member_function_pointer_v<Function>) {
         (code.*fn)(result, xmm_b);
     } else {
         fn(result, xmm_b);
     }
-    FCODE(cmpunordp)(nan_mask, result);
+
+    if (check_input_nan == CheckInputNaN::Yes) {
+        FCODE(cmpunordp)(nan_mask, result);
+    } else if (code.HasAVX()) {
+        FCODE(vcmpunordp)(nan_mask, result, result);
+    } else {
+        code.movaps(nan_mask, result);
+        FCODE(cmpunordp)(nan_mask, nan_mask);
+    }
 
     HandleNaNs<fsize, 2>(code, ctx, fpcr_controlled, {result, xmm_a, xmm_b}, nan_mask, nan_handler);
 
@@ -951,7 +971,7 @@ static void EmitFPVectorMinMax(BlockOfCode& code, EmitContext& ctx, IR::Inst* in
             code.andnps(mask, eq);
             code.orps(result, mask);
         }
-    });
+    }, CheckInputNaN::Yes);
 }
 
 void EmitX64::EmitFPVectorMax32(EmitContext& ctx, IR::Inst* inst) {
diff --git a/externals/dynarmic/src/frontend/A32/decoder/thumb32.inc b/externals/dynarmic/src/frontend/A32/decoder/thumb32.inc
index e5def4ca3..2e133989e 100755
--- a/externals/dynarmic/src/frontend/A32/decoder/thumb32.inc
+++ b/externals/dynarmic/src/frontend/A32/decoder/thumb32.inc
@@ -13,17 +13,17 @@ INST(thumb32_LDMDB, "LDMDB/LDMEA", 
"1110100100W1nnnniiiiii // Load/Store Dual, Load/Store Exclusive, Table Branch //INST(thumb32_STREX, "STREX", "111010000100--------------------") //INST(thumb32_LDREX, "LDREX", "111010000101--------------------") -//INST(thumb32_STRD_imm_1, "STRD (imm)", "11101000-110--------------------") -//INST(thumb32_STRD_imm_2, "STRD (imm)", "11101001-1-0--------------------") -//INST(thumb32_LDRD_imm_1, "LDRD (lit)", "11101000-1111111----------------") -//INST(thumb32_LDRD_imm_2, "LDRD (lit)", "11101001-1-11111----------------") -//INST(thumb32_LDRD_imm_1, "LDRD (imm)", "11101000-111--------------------") -//INST(thumb32_LDRD_imm_2, "LDRD (imm)", "11101001-1-1--------------------") +INST(thumb32_STRD_imm_1, "STRD (imm)", "11101000U110nnnnttttssssiiiiiiii") +INST(thumb32_STRD_imm_2, "STRD (imm)", "11101001U1W0nnnnttttssssiiiiiiii") +INST(thumb32_LDRD_lit_1, "LDRD (lit)", "11101000U1111111ttttssssiiiiiiii") +INST(thumb32_LDRD_lit_2, "LDRD (lit)", "11101001U1W11111ttttssssiiiiiiii") +INST(thumb32_LDRD_imm_1, "LDRD (imm)", "11101000U111nnnnttttssssiiiiiiii") +INST(thumb32_LDRD_imm_2, "LDRD (imm)", "11101001U1W1nnnnttttssssiiiiiiii") //INST(thumb32_STREXB, "STREXB", "111010001100------------0100----") //INST(thumb32_STREXH, "STREXH", "111010001100------------0101----") //INST(thumb32_STREXD, "STREXD", "111010001100------------0111----") -//INST(thumb32_TBB, "TBB", "111010001101------------0000----") -//INST(thumb32_TBH, "TBH", "111010001101------------0001----") +INST(thumb32_TBB, "TBB", "111010001101nnnn111100000000mmmm") +INST(thumb32_TBH, "TBH", "111010001101nnnn111100000001mmmm") //INST(thumb32_LDREXB, "LDREXB", "111010001101------------0100----") //INST(thumb32_LDREXH, "LDREXH", "111010001101------------0101----") //INST(thumb32_LDREXD, "LDREXD", "111010001101------------0111----") diff --git a/externals/dynarmic/src/frontend/A32/translate/impl/thumb32_load_store_dual.cpp b/externals/dynarmic/src/frontend/A32/translate/impl/thumb32_load_store_dual.cpp new file mode 100755 
index 000000000..c3d5323a7
--- /dev/null
+++ b/externals/dynarmic/src/frontend/A32/translate/impl/thumb32_load_store_dual.cpp
@@ -0,0 +1,146 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2021 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "common/bit_util.h"
+#include "frontend/A32/translate/impl/translate_thumb.h"
+
+namespace Dynarmic::A32 {
+// Returns true when the current instruction is inside an IT block but is not its last instruction.
+static bool ITBlockCheck(const A32::IREmitter& ir) {
+    return ir.current_location.IT().IsInITBlock() && !ir.current_location.IT().IsLastInITBlock();
+}
+
+// TBB (half == false) / TBH (half == true): reads a zero-extended table entry from Rn + Rm
+// (TBH: Rn + (Rm << 1)) and branches to PC + 2 * entry.
+static bool TableBranch(ThumbTranslatorVisitor& v, Reg n, Reg m, bool half) {
+    if (m == Reg::PC || ITBlockCheck(v.ir)) {
+        return v.UnpredictableInstruction();
+    }
+
+    const auto reg_m = v.ir.GetRegister(m);
+    const auto reg_n = v.ir.GetRegister(n);
+
+    IR::U32 entry;
+    if (half) {
+        const auto address = v.ir.Add(reg_n, v.ir.LogicalShiftLeft(reg_m, v.ir.Imm8(1)));
+        entry = v.ir.ZeroExtendToWord(v.ir.ReadMemory16(address));
+    } else {
+        entry = v.ir.ZeroExtendToWord(v.ir.ReadMemory8(v.ir.Add(reg_n, reg_m)));
+    }
+
+    const auto current_pc = v.ir.Imm32(v.ir.PC());
+    const auto branch_value = v.ir.Add(current_pc, v.ir.Add(entry, entry));
+
+    v.ir.UpdateUpperLocationDescriptor();
+    v.ir.BranchWritePC(branch_value);
+    v.ir.SetTerm(IR::Term::FastDispatchHint{});
+    return false;
+}
+
+// LDRD (immediate): loads Rt from the computed address and Rt2 from that address + 4.
+// P: apply the offset before the access; U: add (else subtract) imm8 << 2; W: write back to Rn.
+static bool LoadDualImmediate(ThumbTranslatorVisitor& v, bool P, bool U, bool W, Reg n, Reg t, Reg t2, Imm<8> imm8) {
+    if (W && (n == t || n == t2)) {
+        return v.UnpredictableInstruction();
+    }
+    if (t == Reg::PC || t2 == Reg::PC || t == t2) {
+        return v.UnpredictableInstruction();
+    }
+
+    const u32 imm = imm8.ZeroExtend() << 2;
+    const IR::U32 reg_n = v.ir.GetRegister(n);
+    const IR::U32 offset_address = U ? v.ir.Add(reg_n, v.ir.Imm32(imm)) : v.ir.Sub(reg_n, v.ir.Imm32(imm));
+    const IR::U32 address_1 = P ? offset_address : reg_n;
+    const IR::U32 address_2 = v.ir.Add(address_1, v.ir.Imm32(4));
+
+    v.ir.SetRegister(t, v.ir.ReadMemory32(address_1));
+    v.ir.SetRegister(t2, v.ir.ReadMemory32(address_2));
+
+    if (W) {
+        v.ir.SetRegister(n, offset_address);
+    }
+    return true;
+}
+
+// LDRD (literal): loads Rt and Rt2 from AlignPC(4) +/- (imm8 << 2) and the following word.
+static bool LoadDualLiteral(ThumbTranslatorVisitor& v, bool U, bool W, Reg t, Reg t2, Imm<8> imm8) {
+    // Writeback is rejected for the literal form, as are PC or identical destination registers.
+    if (W || t == Reg::PC || t2 == Reg::PC || t == t2) {
+        return v.UnpredictableInstruction();
+    }
+
+    // The address depends only on translation-time constants, so it is computed here
+    // and emitted as an immediate instead of as IR add/sub instructions.
+    const u32 imm = imm8.ZeroExtend() << 2;
+    const u32 base = v.ir.AlignPC(4);
+    const u32 address = U ? base + imm : base - imm;
+
+    v.ir.SetRegister(t, v.ir.ReadMemory32(v.ir.Imm32(address)));
+    v.ir.SetRegister(t2, v.ir.ReadMemory32(v.ir.Imm32(address + 4)));
+    return true;
+}
+
+// STRD (immediate): stores Rt to the computed address and Rt2 to that address + 4.
+// P, U and W select pre-indexing, offset direction and writeback as in LoadDualImmediate.
+static bool StoreDual(ThumbTranslatorVisitor& v, bool P, bool U, bool W, Reg n, Reg t, Reg t2, Imm<8> imm8) {
+    if (W && (n == t || n == t2)) {
+        return v.UnpredictableInstruction();
+    }
+    if (n == Reg::PC || t == Reg::PC || t2 == Reg::PC) {
+        return v.UnpredictableInstruction();
+    }
+
+    const u32 imm = imm8.ZeroExtend() << 2;
+    const IR::U32 reg_n = v.ir.GetRegister(n);
+    const IR::U32 reg_t = v.ir.GetRegister(t);
+    const IR::U32 reg_t2 = v.ir.GetRegister(t2);
+
+    const IR::U32 offset_address = U ? v.ir.Add(reg_n, v.ir.Imm32(imm)) : v.ir.Sub(reg_n, v.ir.Imm32(imm));
+    const IR::U32 address_1 = P ? offset_address : reg_n;
+    const IR::U32 address_2 = v.ir.Add(address_1, v.ir.Imm32(4));
+
+    v.ir.WriteMemory32(address_1, reg_t);
+    v.ir.WriteMemory32(address_2, reg_t2);
+
+    if (W) {
+        v.ir.SetRegister(n, offset_address);
+    }
+    return true;
+}
+
+// Decoder entry points. The *_1 forms fix W = true (and P = false where applicable); the *_2 forms fix P = true and take W from the encoding.
+bool ThumbTranslatorVisitor::thumb32_LDRD_imm_1(bool U, Reg n, Reg t, Reg t2, Imm<8> imm8) {
+    return LoadDualImmediate(*this, false, U, true, n, t, t2, imm8);
+}
+
+bool ThumbTranslatorVisitor::thumb32_LDRD_imm_2(bool U, bool W, Reg n, Reg t, Reg t2, Imm<8> imm8) {
+    return LoadDualImmediate(*this, true, U, W, n, t, t2, imm8);
+}
+
+bool ThumbTranslatorVisitor::thumb32_LDRD_lit_1(bool U, Reg t, Reg t2, Imm<8> imm8) {
+    return LoadDualLiteral(*this, U, true, t, t2, imm8);
+}
+
+bool ThumbTranslatorVisitor::thumb32_LDRD_lit_2(bool U, bool W, Reg t, Reg t2, Imm<8> imm8) {
+    return LoadDualLiteral(*this, U, W, t, t2, imm8);
+}
+
+bool ThumbTranslatorVisitor::thumb32_STRD_imm_1(bool U, Reg n, Reg t, Reg t2, Imm<8> imm8) {
+    return StoreDual(*this, false, U, true, n, t, t2, imm8);
+}
+
+bool ThumbTranslatorVisitor::thumb32_STRD_imm_2(bool U, bool W, Reg n, Reg t, Reg t2, Imm<8> imm8) {
+    return StoreDual(*this, true, U, W, n, t, t2, imm8);
+}
+
+bool ThumbTranslatorVisitor::thumb32_TBB(Reg n, Reg m) {
+    return TableBranch(*this, n, m, false);
+}
+
+bool ThumbTranslatorVisitor::thumb32_TBH(Reg n, Reg m) {
+    return TableBranch(*this, n, m, true);
+}
+
+} // namespace Dynarmic::A32
diff --git a/externals/dynarmic/src/frontend/A32/translate/impl/translate_thumb.h b/externals/dynarmic/src/frontend/A32/translate/impl/translate_thumb.h
index 174f601c4..ce43eba50 100755
--- a/externals/dynarmic/src/frontend/A32/translate/impl/translate_thumb.h
+++ b/externals/dynarmic/src/frontend/A32/translate/impl/translate_thumb.h
@@ -179,6 +179,16 @@ struct ThumbTranslatorVisitor final {
     bool thumb32_STMIA(bool W, Reg n, Imm<15> reg_list);
     bool thumb32_STMDB(bool W, Reg n, Imm<15> reg_list);
 
+    // thumb32 load/store dual, load/store exclusive, 
table branch instructions + bool thumb32_LDRD_imm_1(bool U, Reg n, Reg t, Reg t2, Imm<8> imm8); + bool thumb32_LDRD_imm_2(bool U, bool W, Reg n, Reg t, Reg t2, Imm<8> imm8); + bool thumb32_LDRD_lit_1(bool U, Reg t, Reg t2, Imm<8> imm8); + bool thumb32_LDRD_lit_2(bool U, bool W, Reg t, Reg t2, Imm<8> imm8); + bool thumb32_STRD_imm_1(bool U, Reg n, Reg t, Reg t2, Imm<8> imm8); + bool thumb32_STRD_imm_2(bool U, bool W, Reg n, Reg t, Reg t2, Imm<8> imm8); + bool thumb32_TBB(Reg n, Reg m); + bool thumb32_TBH(Reg n, Reg m); + // thumb32 data processing (shifted register) instructions bool thumb32_TST_reg(Reg n, Imm<3> imm3, Imm<2> imm2, ShiftType type, Reg m); bool thumb32_AND_reg(bool S, Reg n, Imm<3> imm3, Reg d, Imm<2> imm2, ShiftType type, Reg m);