From 67618a2c55e0b6860bbb083962cdd28a543bf82a Mon Sep 17 00:00:00 2001
From: Tony Wasserka <NeoBrainX@gmail.com>
Date: Fri, 12 Dec 2014 22:50:09 +0100
Subject: [PATCH] Pica/VertexShader: Add support for MOVA, CMP and IFC.

---
 src/video_core/pica.h            |   8 ++
 src/video_core/vertex_shader.cpp | 137 +++++++++++++++++++++++++++++--
 2 files changed, 138 insertions(+), 7 deletions(-)
diff --git a/src/video_core/pica.h b/src/video_core/pica.h
index 583614328..87a9e7913 100644
--- a/src/video_core/pica.h
+++ b/src/video_core/pica.h
@@ -771,6 +771,14 @@ struct float24 {
         return ToFloat32() <= flt.ToFloat32();
     }
 
+    bool operator == (const float24& flt) const { // Equal when both ToFloat32() values compare equal
+        return ToFloat32() == flt.ToFloat32();
+    }
+
+    bool operator != (const float24& flt) const { // Negated counterpart of operator ==, on ToFloat32() values
+        return ToFloat32() != flt.ToFloat32();
+    }
+
 private:
     // Stored as a regular float, merely for convenience
     // TODO: Perform proper arithmetic on this!
diff --git a/src/video_core/vertex_shader.cpp b/src/video_core/vertex_shader.cpp
index 33a862b74..5d9203c86 100644
--- a/src/video_core/vertex_shader.cpp
+++ b/src/video_core/vertex_shader.cpp
@@ -59,6 +59,8 @@ const std::array<u32, 1024>& GetSwizzlePatterns()
     return swizzle_data;
 }
 
+// Capacity of VertexShaderState::if_stack. TODO: Is there actually a limit on hardware?
+const int if_stack_size = 8;
 
 struct VertexShaderState {
     u32* program_counter;
@@ -67,7 +69,11 @@ struct VertexShaderState {
     float24* output_register_table[7*4];
 
     Math::Vec4<float24> temporary_registers[16];
-    bool status_registers[2];
+    bool conditional_code[2]; // [0] / [1]: results of the x / y comparisons written by CMP and tested by IFC
+
+    // Two Address registers and one loop counter
+    // TODO: How many bits do these actually have?
+    s32 address_registers[3]; // MOVA writes entries 0 and 1; arithmetic src1 lookups add one entry as an offset
 
     enum {
         INVALID_ADDRESS = 0xFFFFFFFF
@@ -75,6 +81,12 @@ struct VertexShaderState {
     u32 call_stack[8]; // TODO: What is the maximal call stack depth?
     u32* call_stack_pointer;
 
+    struct IfStackElement {
+        u32 else_addr;         // Offset into shader memory where the "else" block starts (IFC dest_offset)
+        u32 else_instructions; // Instructions skipped once else_addr is reached (IFC num_instructions)
+    } if_stack[if_stack_size];
+    IfStackElement* if_stack_pointer; // Topmost used entry of if_stack; set to if_stack - 1 while the stack is empty
+
     struct {
         u32 max_offset; // maximum program counter ever reached
         u32 max_opdesc_id; // maximum swizzle pattern index ever used
@@ -107,11 +119,20 @@ static void ProcessShaderCode(VertexShaderState& state) {
         case Instruction::OpCodeType::Arithmetic:
         {
             bool is_inverted = 0 != (instr.opcode.GetInfo().subtype & Instruction::OpCodeInfo::SrcInversed);
-            const float24* src1_ = LookupSourceRegister(instr.common.GetSrc1(is_inverted));
+            if (is_inverted) {
+                // We don't really support this properly and/or reliably
+                LOG_ERROR(HW_GPU, "Bad condition...");
+                exit(EXIT_FAILURE); // Error path: report failure to the caller instead of a clean exit status
+            }
+
+            const int address_offset = (instr.common.address_register_index == 0) // Index 0: no offset; N > 0: entry N - 1
+                                       ? 0 : state.address_registers[instr.common.address_register_index - 1];
+
+            const float24* src1_ = LookupSourceRegister(instr.common.GetSrc1(is_inverted) + address_offset);
             const float24* src2_ = LookupSourceRegister(instr.common.GetSrc2(is_inverted));
 
-            const bool negate_src1 = (swizzle.negate_src1 != 0);
-            const bool negate_src2 = (swizzle.negate_src2 != 0);
+            const bool negate_src1 = (swizzle.negate_src1 != false);
+            const bool negate_src2 = (swizzle.negate_src2 != false);
 
             float24 src1[4] = {
                 src1_[(int)swizzle.GetSelectorSrc1(0)],
@@ -217,6 +238,19 @@ static void ProcessShaderCode(VertexShaderState& state) {
                 break;
             }
 
+            case Instruction::OpCode::MOVA:
+            {
+                for (int i = 0; i < 2; ++i) { // Components 0 and 1 go to address_registers[0] and [1]
+                    if (!swizzle.DestComponentEnabled(i))
+                        continue;
+
+                    // TODO: Figure out how the rounding is done on hardware
+                    state.address_registers[i] = static_cast<s32>(src1[i].ToFloat32()); // The cast truncates toward zero
+                }
+
+                break;
+            }
+
             case Instruction::OpCode::MOV:
             {
                 for (int i = 0; i < 4; ++i) {
@@ -228,16 +262,56 @@ static void ProcessShaderCode(VertexShaderState& state) {
                 break;
             }
 
+            case Instruction::OpCode::CMP: // Compares src1 with src2 and stores the outcome in conditional_code
+                for (int i = 0; i < 2; ++i) { // i == 0: x component, i == 1: y component
+                    // TODO: Can you restrict to one compare via dest masking?
+
+                    auto compare_op = instr.common.compare_op;
+                    auto op = (i == 0) ? compare_op.x.Value() : compare_op.y.Value(); // Each component has its own mode
+
+                    switch (op) {
+                        case compare_op.Equal:
+                            state.conditional_code[i] = (src1[i] == src2[i]);
+                            break;
+
+                        case compare_op.NotEqual:
+                            state.conditional_code[i] = (src1[i] != src2[i]);
+                            break;
+
+                        case compare_op.LessThan:
+                            state.conditional_code[i] = (src1[i] <  src2[i]);
+                            break;
+
+                        case compare_op.LessEqual:
+                            state.conditional_code[i] = (src1[i] <= src2[i]);
+                            break;
+
+                        case compare_op.GreaterThan:
+                            state.conditional_code[i] = (src1[i] >  src2[i]);
+                            break;
+
+                        case compare_op.GreaterEqual:
+                            state.conditional_code[i] = (src1[i] >= src2[i]);
+                            break;
+
+                        default: // Unknown mode: only logged, conditional_code[i] keeps its previous value
+                            LOG_ERROR(HW_GPU, "Unknown compare mode %x", static_cast<int>(op));
+                            break;
+                    }
+                }
+                break;
+
             default:
                 LOG_ERROR(HW_GPU, "Unhandled arithmetic instruction: 0x%02x (%s): 0x%08x",
                           (int)instr.opcode.Value(), instr.opcode.GetInfo().name, instr.hex);
+                _dbg_assert_(HW_GPU, 0);
                 break;
             }
 
             break;
         }
         default:
-            // Process instruction explicitly
+            // Handle each instruction on its own
             switch (instr.opcode) {
             // NOP is currently used as a heuristic for leaving from a function.
             // TODO: This is completely incorrect.
@@ -265,6 +339,56 @@ static void ProcessShaderCode(VertexShaderState& state) {
                 // TODO
                 break;
 
+            case Instruction::OpCode::IFC:
+            {
+                // Conditional block: compare the condition codes written by CMP against the
+                // instruction's reference values, then either enter the "if" body (remembering
+                // where the "else" block starts and how long it is) or jump straight to "else".
+                // TODO: Do we need to consider swizzlers here?
+
+                auto flow_control = instr.flow_control;
+                bool results[3] = { flow_control.refx == state.conditional_code[0],
+                                    flow_control.refy == state.conditional_code[1] };
+
+                switch (flow_control.op) {
+                case flow_control.Or:
+                    results[2] = results[0] || results[1];
+                    break;
+
+                case flow_control.And:
+                    results[2] = results[0] && results[1];
+                    break;
+
+                case flow_control.JustX:
+                    results[2] = results[0];
+                    break;
+
+                case flow_control.JustY:
+                    results[2] = results[1];
+                    break;
+                }
+
+                if (results[2]) {
+                    // The IF stack is a fixed-size array: when its last slot is already in use, stop
+                    // executing (exit_loop is checked at the bottom of the loop) instead of writing past it.
+                    if (state.if_stack_pointer == &state.if_stack[if_stack_size - 1]) {
+                        LOG_ERROR(HW_GPU, "IF stack overflow: more than %d nested IFC blocks", if_stack_size);
+                        exit_loop = true;
+                        break;
+                    }
+
+                    ++state.if_stack_pointer;
+
+                    state.if_stack_pointer->else_addr = instr.flow_control.dest_offset;
+                    state.if_stack_pointer->else_instructions = instr.flow_control.num_instructions;
+                } else {
+                    // Condition failed: continue at dest_offset; the - 1 presumably offsets the increment below
+                    state.program_counter = &shader_memory[instr.flow_control.dest_offset] - 1;
+                }
+
+                break;
+            }
+
             default:
                 LOG_ERROR(HW_GPU, "Unhandled instruction: 0x%02x (%s): 0x%08x",
                           (int)instr.opcode.Value(), instr.opcode.GetInfo().name, instr.hex);
@@ -277,6 +389,13 @@ static void ProcessShaderCode(VertexShaderState& state) {
         if (increment_pc)
             ++state.program_counter;
 
+        // End of an "if" body: skip its "else" block. Repeat, since that can land on an enclosing else_addr.
+        while (state.if_stack_pointer >= &state.if_stack[0] &&
+               state.program_counter - shader_memory.data() == state.if_stack_pointer->else_addr) {
+            state.program_counter += state.if_stack_pointer->else_instructions;
+            state.if_stack_pointer--;
+        }
+
         if (exit_loop)
             break;
     }
@@ -326,11 +445,15 @@ OutputVertex RunShader(const InputVertex& input, int num_attributes)
             state.output_register_table[4*i+comp] = ((float24*)&ret) + semantics[comp];
     }
 
-    state.status_registers[0] = false;
-    state.status_registers[1] = false;
+    state.conditional_code[0] = false;
+    state.conditional_code[1] = false;
     boost::fill(state.call_stack, VertexShaderState::INVALID_ADDRESS);
     state.call_stack_pointer = &state.call_stack[0];
 
+    std::fill_n(state.if_stack, if_stack_size, // Mark every slot as unused before the shader runs
+                VertexShaderState::IfStackElement{VertexShaderState::INVALID_ADDRESS, VertexShaderState::INVALID_ADDRESS});
+    state.if_stack_pointer = state.if_stack - 1; // Empty stack sits one slot before the array. TODO: Make this less ugly
+
     ProcessShaderCode(state);
     DebugUtils::DumpShader(shader_memory.data(), state.debug.max_offset, swizzle_data.data(),
                            state.debug.max_opdesc_id, registers.vs_main_offset,