diff --git a/.gitmodules b/.gitmodules index dc69c4415..daf39a780 100644 --- a/.gitmodules +++ b/.gitmodules @@ -18,3 +18,7 @@ path = dependencies/imgui url = https://github.com/ocornut/imgui shallow = true +[submodule "dependencies/metal-cpp"] + path = dependencies/metal-cpp + url = https://github.com/bkaradzic/metal-cpp.git + shallow = true diff --git a/CMakeLists.txt b/CMakeLists.txt index 560728f26..93198e1bf 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -25,7 +25,7 @@ if (ENABLE_VCPKG) OUTPUT_VARIABLE is_vcpkg_shallow OUTPUT_STRIP_TRAILING_WHITESPACE ) - + if(is_vcpkg_shallow STREQUAL "true") message(STATUS "vcpkg is shallow. Unshallowing it now...") execute_process( @@ -102,10 +102,20 @@ if (UNIX AND NOT APPLE) option(ENABLE_BLUEZ "Build with Bluez support" ON) endif() +if (APPLE) + set(ENABLE_METAL_DEFAULT ON) +else() + set(ENABLE_METAL_DEFAULT OFF) +endif() + option(ENABLE_OPENGL "Enables the OpenGL backend" ON) option(ENABLE_VULKAN "Enables the Vulkan backend" ON) +option(ENABLE_METAL "Enables the Metal backend" ${ENABLE_METAL_DEFAULT}) option(ENABLE_DISCORD_RPC "Enables the Discord Rich Presence feature" ON) +if (ENABLE_METAL AND NOT APPLE) + message(FATAL_ERROR "Metal backend is only supported on Apple platforms") +endif() # input backends if (WIN32) @@ -180,6 +190,12 @@ if (ENABLE_OPENGL) find_package(OpenGL REQUIRED) endif() +if (ENABLE_METAL) + include_directories(${CMAKE_SOURCE_DIR}/dependencies/metal-cpp) + + add_definitions(-DENABLE_METAL=1) +endif() + if (ENABLE_DISCORD_RPC) add_compile_definitions(ENABLE_DISCORD_RPC) add_subdirectory(dependencies/discord-rpc EXCLUDE_FROM_ALL) @@ -206,7 +222,7 @@ endif() if (ENABLE_CUBEB) if (NOT ENABLE_VCPKG) - find_package(cubeb) + find_package(cubeb) endif() if (NOT cubeb_FOUND) option(BUILD_TESTS "" OFF) diff --git a/dependencies/metal-cpp b/dependencies/metal-cpp new file mode 160000 index 000000000..a63bd172d --- /dev/null +++ b/dependencies/metal-cpp @@ -0,0 +1 @@ +Subproject commit 
a63bd172ddcba73a3d87ca32032b66ad41ddb9a6 diff --git a/src/Cafe/CMakeLists.txt b/src/Cafe/CMakeLists.txt index d51d58d5a..881a6d6dd 100644 --- a/src/Cafe/CMakeLists.txt +++ b/src/Cafe/CMakeLists.txt @@ -523,7 +523,70 @@ add_library(CemuCafe ) if(APPLE) - target_sources(CemuCafe PRIVATE "HW/Latte/Renderer/Vulkan/CocoaSurface.mm") + target_sources(CemuCafe PRIVATE + HW/Latte/Renderer/Vulkan/CocoaSurface.mm + HW/Latte/Renderer/MetalView.mm + HW/Latte/Renderer/MetalView.h + ) +endif() + +if(ENABLE_METAL) + # TODO: sort alphabetically + target_sources(CemuCafe PRIVATE + HW/Latte/Renderer/Metal/MetalRenderer.cpp + HW/Latte/Renderer/Metal/MetalRenderer.h + HW/Latte/Renderer/Metal/MetalCommon.h + HW/Latte/Renderer/Metal/MetalCppImpl.cpp + HW/Latte/Renderer/Metal/MetalLayer.mm + HW/Latte/Renderer/Metal/MetalLayer.h + HW/Latte/Renderer/Metal/MetalLayerHandle.cpp + HW/Latte/Renderer/Metal/MetalLayerHandle.h + HW/Latte/Renderer/Metal/LatteToMtl.cpp + HW/Latte/Renderer/Metal/LatteToMtl.h + HW/Latte/Renderer/Metal/LatteTextureMtl.cpp + HW/Latte/Renderer/Metal/LatteTextureMtl.h + HW/Latte/Renderer/Metal/LatteTextureViewMtl.cpp + HW/Latte/Renderer/Metal/LatteTextureViewMtl.h + HW/Latte/Renderer/Metal/LatteTextureReadbackMtl.cpp + HW/Latte/Renderer/Metal/LatteTextureReadbackMtl.h + HW/Latte/Renderer/Metal/RendererShaderMtl.cpp + HW/Latte/Renderer/Metal/RendererShaderMtl.h + HW/Latte/Renderer/Metal/CachedFBOMtl.cpp + HW/Latte/Renderer/Metal/CachedFBOMtl.h + HW/Latte/Renderer/Metal/MetalAttachmentsInfo.cpp + HW/Latte/Renderer/Metal/MetalAttachmentsInfo.h + HW/Latte/Renderer/Metal/MetalBufferAllocator.cpp + HW/Latte/Renderer/Metal/MetalBufferAllocator.h + HW/Latte/Renderer/Metal/MetalMemoryManager.cpp + HW/Latte/Renderer/Metal/MetalMemoryManager.h + HW/Latte/Renderer/Metal/MetalOutputShaderCache.cpp + HW/Latte/Renderer/Metal/MetalOutputShaderCache.h + HW/Latte/Renderer/Metal/MetalPipelineCompiler.cpp + HW/Latte/Renderer/Metal/MetalPipelineCompiler.h + 
HW/Latte/Renderer/Metal/MetalPipelineCache.cpp + HW/Latte/Renderer/Metal/MetalPipelineCache.h + HW/Latte/Renderer/Metal/MetalDepthStencilCache.cpp + HW/Latte/Renderer/Metal/MetalDepthStencilCache.h + HW/Latte/Renderer/Metal/MetalSamplerCache.cpp + HW/Latte/Renderer/Metal/MetalSamplerCache.h + HW/Latte/Renderer/Metal/MetalVoidVertexPipeline.cpp + HW/Latte/Renderer/Metal/MetalVoidVertexPipeline.h + HW/Latte/Renderer/Metal/MetalQuery.cpp + HW/Latte/Renderer/Metal/MetalQuery.h + HW/Latte/Renderer/Metal/MetalPerformanceMonitor.h + HW/Latte/Renderer/Metal/UtilityShaderSource.h + ) + + target_sources(CemuCafe PRIVATE + HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLAttrDecoder.cpp + HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp + HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp + ) + + #target_link_libraries(CemuCafe PRIVATE + # "-framework Metal" + # "-framework QuartzCore" + #) endif() set_property(TARGET CemuCafe PROPERTY MSVC_RUNTIME_LIBRARY "MultiThreaded$<$:Debug>") diff --git a/src/Cafe/CafeSystem.cpp b/src/Cafe/CafeSystem.cpp index 1bf3755ee..6a440b69e 100644 --- a/src/Cafe/CafeSystem.cpp +++ b/src/Cafe/CafeSystem.cpp @@ -252,7 +252,16 @@ void InfoLog_PrintActiveSettings() if (ActiveSettings::GetGraphicsAPI() == GraphicAPI::kVulkan) { cemuLog_log(LogType::Force, "Async compile: {}", GetConfig().async_compile.GetValue() ? "true" : "false"); - if(!GetConfig().vk_accurate_barriers.GetValue()) + if (!GetConfig().vk_accurate_barriers.GetValue()) + cemuLog_log(LogType::Force, "Accurate barriers are disabled!"); + } + else if (ActiveSettings::GetGraphicsAPI() == GraphicAPI::kMetal) + { + cemuLog_log(LogType::Force, "Async compile: {}", GetConfig().async_compile.GetValue() ? "true" : "false"); + cemuLog_log(LogType::Force, "Fast math: {}", g_current_game_profile->GetFastMath() ? 
"true" : "false"); + cemuLog_log(LogType::Force, "Buffer cache type: {}", g_current_game_profile->GetBufferCacheMode()); + cemuLog_log(LogType::Force, "Position invariance: {}", g_current_game_profile->GetPositionInvariance() ? "true" : "false"); + if (!GetConfig().vk_accurate_barriers.GetValue()) cemuLog_log(LogType::Force, "Accurate barriers are disabled!"); } cemuLog_log(LogType::Force, "Console language: {}", stdx::to_underlying(config.console_language.GetValue())); @@ -1014,7 +1023,7 @@ namespace CafeSystem { // starting with Cemu 1.27.0 /vol/storage_mlc01/ is virtualized, meaning that it doesn't point to one singular host os folder anymore // instead it now uses a more complex solution to source titles with various formats (folder, wud, wua) from the game paths and host mlc path - + // todo - mount /vol/storage_mlc01/ with base priority to the host mlc? // since mounting titles is an expensive operation we have to avoid mounting all titles at once diff --git a/src/Cafe/GameProfile/GameProfile.cpp b/src/Cafe/GameProfile/GameProfile.cpp index 2a83b3fee..ff3978605 100644 --- a/src/Cafe/GameProfile/GameProfile.cpp +++ b/src/Cafe/GameProfile/GameProfile.cpp @@ -127,7 +127,7 @@ bool gameProfile_loadIntegerOption(IniParser& iniParser, const char* optionName, { cemuLog_log(LogType::Force, "Value '{}' is out of range for option '{}' in game profile", *option_value, optionName); return false; - } + } } template @@ -224,8 +224,11 @@ bool GameProfile::Load(uint64_t title_id) gameProfile_loadIntegerOption(&iniParser, "graphics_api", &graphicsApi, -1, 0, 1); if (graphicsApi.value != -1) m_graphics_api = (GraphicAPI)graphicsApi.value; - + gameProfile_loadEnumOption(iniParser, "accurateShaderMul", m_accurateShaderMul); + gameProfile_loadBooleanOption2(iniParser, "fastMath", m_fastMath); + gameProfile_loadEnumOption(iniParser, "bufferCacheMode", m_bufferCacheMode); + gameProfile_loadBooleanOption2(iniParser, "positionInvariance", m_positionInvariance); // legacy support auto 
option_precompiledShaders = iniParser.FindOption("precompiledShaders"); @@ -277,7 +280,7 @@ bool GameProfile::Load(uint64_t title_id) void GameProfile::Save(uint64_t title_id) { auto gameProfileDir = ActiveSettings::GetConfigPath("gameProfiles"); - if (std::error_code ex_ec; !fs::exists(gameProfileDir, ex_ec)) + if (std::error_code ex_ec; !fs::exists(gameProfileDir, ex_ec)) fs::create_directories(gameProfileDir, ex_ec); auto gameProfilePath = gameProfileDir / fmt::format("{:016x}.ini", title_id); FileStream* fs = FileStream::createFile2(gameProfilePath); @@ -308,6 +311,9 @@ void GameProfile::Save(uint64_t title_id) fs->writeLine("[Graphics]"); WRITE_ENTRY(accurateShaderMul); + WRITE_ENTRY(fastMath); + WRITE_ENTRY(bufferCacheMode); + WRITE_ENTRY(positionInvariance); WRITE_OPTIONAL_ENTRY(precompiledShaders); WRITE_OPTIONAL_ENTRY(graphics_api); fs->writeLine(""); @@ -337,6 +343,9 @@ void GameProfile::ResetOptional() // graphic settings m_accurateShaderMul = AccurateShaderMulOption::True; + m_fastMath = true; + m_bufferCacheMode = BufferCacheMode::DevicePrivate; + m_positionInvariance = false; // cpu settings m_threadQuantum = kThreadQuantumDefault; m_cpuMode.reset(); // CPUModeOption::kSingleCoreRecompiler; @@ -354,9 +363,12 @@ void GameProfile::Reset() // general settings m_loadSharedLibraries = true; m_startWithPadView = false; - + // graphic settings m_accurateShaderMul = AccurateShaderMulOption::True; + m_fastMath = true; + m_bufferCacheMode = BufferCacheMode::DevicePrivate; + m_positionInvariance = false; m_precompiledShaders = PrecompiledShaderOption::Auto; // cpu settings m_threadQuantum = kThreadQuantumDefault; @@ -366,4 +378,4 @@ void GameProfile::Reset() // controller settings for (auto& profile : m_controllerProfile) profile.reset(); -} \ No newline at end of file +} diff --git a/src/Cafe/GameProfile/GameProfile.h b/src/Cafe/GameProfile/GameProfile.h index 6a1f2ebd6..359e6a0ac 100644 --- a/src/Cafe/GameProfile/GameProfile.h +++ 
b/src/Cafe/GameProfile/GameProfile.h @@ -31,6 +31,9 @@ class GameProfile [[nodiscard]] const std::optional& GetGraphicsAPI() const { return m_graphics_api; } [[nodiscard]] const AccurateShaderMulOption& GetAccurateShaderMul() const { return m_accurateShaderMul; } + [[nodiscard]] bool GetFastMath() const { return m_fastMath; } + [[nodiscard]] BufferCacheMode GetBufferCacheMode() const { return m_bufferCacheMode; } + [[nodiscard]] bool GetPositionInvariance() const { return m_positionInvariance; } [[nodiscard]] const std::optional& GetPrecompiledShadersState() const { return m_precompiledShaders; } [[nodiscard]] uint32 GetThreadQuantum() const { return m_threadQuantum; } @@ -54,6 +57,9 @@ class GameProfile // graphic settings std::optional m_graphics_api{}; AccurateShaderMulOption m_accurateShaderMul = AccurateShaderMulOption::True; + bool m_fastMath = true; + BufferCacheMode m_bufferCacheMode = BufferCacheMode::DevicePrivate; + bool m_positionInvariance = false; std::optional m_precompiledShaders{}; // cpu settings uint32 m_threadQuantum = kThreadQuantumDefault; // values: 20000 45000 60000 80000 100000 diff --git a/src/Cafe/GraphicPack/GraphicPack2.cpp b/src/Cafe/GraphicPack/GraphicPack2.cpp index f21bb89d8..77c89e78d 100644 --- a/src/Cafe/GraphicPack/GraphicPack2.cpp +++ b/src/Cafe/GraphicPack/GraphicPack2.cpp @@ -109,7 +109,7 @@ bool GraphicPack2::LoadGraphicPack(const fs::path& rulesPath, IniParser& rules) gp->SetActivePreset(kv.first, kv.second, false); } - + gp->SetEnabled(enabled); } @@ -141,7 +141,7 @@ bool GraphicPack2::DeactivateGraphicPack(const std::shared_ptr& gr if (!graphic_pack->IsActivated()) return false; - const auto it = std::find_if(s_active_graphic_packs.begin(), s_active_graphic_packs.end(), + const auto it = std::find_if(s_active_graphic_packs.begin(), s_active_graphic_packs.end(), [graphic_pack](const GraphicPackPtr& gp) { return gp->GetNormalizedPathString() == graphic_pack->GetNormalizedPathString(); @@ -269,6 +269,8 @@ 
GraphicPack2::GraphicPack2(fs::path rulesPath, IniParser& rules) m_renderer_api = RendererAPI::Vulkan; else if (boost::iequals(*option_rendererFilter, "opengl")) m_renderer_api = RendererAPI::OpenGL; + else if (boost::iequals(*option_rendererFilter, "metal")) + m_renderer_api = RendererAPI::Metal; else cemuLog_log(LogType::Force, "Unknown value '{}' for rendererFilter option", *option_rendererFilter); } @@ -348,7 +350,7 @@ GraphicPack2::GraphicPack2(fs::path rulesPath, IniParser& rules) cemuLog_log(LogType::Force, "Graphic pack \"{}\": Preset in line {} skipped because it has no name option defined", GetNormalizedPathString(), rules.GetCurrentSectionLineNumber()); continue; } - + const auto category = rules.FindOption("category"); const auto condition = rules.FindOption("condition"); const auto default_selected = rules.FindOption("default"); @@ -420,13 +422,13 @@ GraphicPack2::GraphicPack2(fs::path rulesPath, IniParser& rules) { // store by category std::unordered_map> tmp_map; - + // all vars must be defined in the default preset vars before std::vector> mismatchingPresetVars; for (const auto& presetEntry : m_presets) { tmp_map[presetEntry->category].emplace_back(presetEntry); - + for (auto& presetVar : presetEntry->variables) { const auto it = m_preset_vars.find(presetVar.first); @@ -568,7 +570,7 @@ void GraphicPack2::ValidatePresetSelections() // // example: a preset category might be hidden entirely (e.g. 
due to a separate advanced options dropdown) // how to handle: leave the previously selected preset - // + // // the logic is therefore as follows: // if there is a preset category with at least 1 visible preset entry then make sure one of those is actually selected // for completely hidden preset categories we leave the selection as-is @@ -632,17 +634,17 @@ bool GraphicPack2::SetActivePreset(std::string_view category, std::string_view n // disable currently active preset std::for_each(m_presets.begin(), m_presets.end(), [category](PresetPtr& p) { - if(p->category == category) + if(p->category == category) p->active = false; }); - + if (name.empty()) return true; - + // enable new preset const auto it = std::find_if(m_presets.cbegin(), m_presets.cend(), [category, name](const PresetPtr& preset) { - return preset->category == category && preset->name == name; + return preset->category == category && preset->name == name; }); bool result; @@ -681,12 +683,14 @@ void GraphicPack2::LoadShaders() wchar_t shader_type[256]{}; if (filename.size() < 256 && swscanf(filename.c_str(), L"%" SCNx64 "_%" SCNx64 "_%ls", &shader_base_hash, &shader_aux_hash, shader_type) == 3) { + bool isMetalShader = (shader_type[2] == '_' && shader_type[3] == 'm' && shader_type[4] == 's' && shader_type[5] == 'l'); + if (shader_type[0] == 'p' && shader_type[1] == 's') - m_custom_shaders.emplace_back(LoadShader(p, shader_base_hash, shader_aux_hash, GP_SHADER_TYPE::PIXEL)); + m_custom_shaders.emplace_back(LoadShader(p, shader_base_hash, shader_aux_hash, GP_SHADER_TYPE::PIXEL, isMetalShader)); else if (shader_type[0] == 'v' && shader_type[1] == 's') - m_custom_shaders.emplace_back(LoadShader(p, shader_base_hash, shader_aux_hash, GP_SHADER_TYPE::VERTEX)); + m_custom_shaders.emplace_back(LoadShader(p, shader_base_hash, shader_aux_hash, GP_SHADER_TYPE::VERTEX, isMetalShader)); else if (shader_type[0] == 'g' && shader_type[1] == 's') - m_custom_shaders.emplace_back(LoadShader(p, shader_base_hash, 
shader_aux_hash, GP_SHADER_TYPE::GEOMETRY)); + m_custom_shaders.emplace_back(LoadShader(p, shader_base_hash, shader_aux_hash, GP_SHADER_TYPE::GEOMETRY, isMetalShader)); } else if (filename == L"output.glsl") { @@ -783,7 +787,7 @@ std::optional GraphicPack2::GetPresetVariable(const std return it->second; } } - + for (const auto& preset : presets) { if (!preset->visible) @@ -793,7 +797,7 @@ std::optional GraphicPack2::GetPresetVariable(const std return it->second; } } - + const auto it = std::find_if(m_preset_vars.cbegin(), m_preset_vars.cend(), [&var_name](auto p) { return p.first == var_name; }); if (it != m_preset_vars.cend()) { @@ -839,7 +843,7 @@ void GraphicPack2::_iterateReplacedFiles(const fs::path& currentPath, bool isAOC virtualMountPath = fs::path("vol/content/") / virtualMountPath; } fscDeviceRedirect_add(virtualMountPath.generic_string(), it.file_size(), it.path().generic_string(), m_fs_priority); - } + } } } @@ -859,7 +863,7 @@ void GraphicPack2::LoadReplacedFiles() std::error_code ec; if (fs::exists(contentPath, ec)) { - // setup redirections + // setup redirections fscDeviceRedirect_map(); _iterateReplacedFiles(contentPath, false); } @@ -872,7 +876,7 @@ void GraphicPack2::LoadReplacedFiles() uint64 aocTitleId = CafeSystem::GetForegroundTitleId(); aocTitleId = aocTitleId & 0xFFFFFFFFULL; aocTitleId |= 0x0005000c00000000ULL; - // setup redirections + // setup redirections fscDeviceRedirect_map(); _iterateReplacedFiles(aocPath, true); } @@ -988,7 +992,7 @@ bool GraphicPack2::Activate() // enable patch groups EnablePatches(); - + // load replaced files LoadReplacedFiles(); @@ -1034,7 +1038,7 @@ bool GraphicPack2::Deactivate() m_output_shader_source.clear(); m_upscaling_shader_source.clear(); m_downscaling_shader_source.clear(); - + if (HasCustomVSyncFrequency()) { m_vsync_frequency = -1; @@ -1045,7 +1049,7 @@ bool GraphicPack2::Deactivate() return true; } -const std::string* GraphicPack2::FindCustomShaderSource(uint64 shaderBaseHash, uint64 shaderAuxHash, 
GP_SHADER_TYPE type, bool isVulkanRenderer) +const std::string* GraphicPack2::FindCustomShaderSource(uint64 shaderBaseHash, uint64 shaderAuxHash, GP_SHADER_TYPE type, bool isVulkanRenderer, bool isMetalRenderer) { for (const auto& gp : GraphicPack2::GetActiveGraphicPacks()) { @@ -1055,9 +1059,12 @@ const std::string* GraphicPack2::FindCustomShaderSource(uint64 shaderBaseHash, u if (it == gp->m_custom_shaders.end()) continue; - if(isVulkanRenderer && (*it).isPreVulkanShader) + if (isVulkanRenderer && (*it).isPreVulkanShader) continue; + if (isMetalRenderer != (*it).isMetalShader) + continue; + return &it->source; } return nullptr; @@ -1066,7 +1073,7 @@ const std::string* GraphicPack2::FindCustomShaderSource(uint64 shaderBaseHash, u std::unordered_map> GraphicPack2::GetCategorizedPresets(std::vector& order) const { order.clear(); - + std::unordered_map> result; for(const auto& entry : m_presets) { @@ -1075,13 +1082,13 @@ std::unordered_map> GraphicPac if (it == order.cend()) order.emplace_back(entry->category); } - + return result; } bool GraphicPack2::HasShaders() const { - return !GetCustomShaders().empty() + return !GetCustomShaders().empty() || !m_output_shader_source.empty() || !m_upscaling_shader_source.empty() || !m_downscaling_shader_source.empty(); } @@ -1215,7 +1222,7 @@ void GraphicPack2::ApplyShaderPresets(std::string& shader_source) const } } -GraphicPack2::CustomShader GraphicPack2::LoadShader(const fs::path& path, uint64 shader_base_hash, uint64 shader_aux_hash, GP_SHADER_TYPE shader_type) const +GraphicPack2::CustomShader GraphicPack2::LoadShader(const fs::path& path, uint64 shader_base_hash, uint64 shader_aux_hash, GP_SHADER_TYPE shader_type, bool isMetalShader) const { CustomShader shader; @@ -1234,6 +1241,7 @@ GraphicPack2::CustomShader GraphicPack2::LoadShader(const fs::path& path, uint64 shader.shader_aux_hash = shader_aux_hash; shader.type = shader_type; shader.isPreVulkanShader = this->m_version <= 3; + shader.isMetalShader = isMetalShader; 
return shader; } diff --git a/src/Cafe/GraphicPack/GraphicPack2.h b/src/Cafe/GraphicPack/GraphicPack2.h index 9b6a86d4f..5fca2f441 100644 --- a/src/Cafe/GraphicPack/GraphicPack2.h +++ b/src/Cafe/GraphicPack/GraphicPack2.h @@ -57,7 +57,7 @@ class GraphicPack2 sint32 lod_bias = -1; // in 1/64th steps sint32 relative_lod_bias = -1; // in 1/64th steps sint32 anistropic_value = -1; // 1< vars) : name(name), variables(std::move(vars)) {} Preset(std::string_view category, std::string_view name, std::unordered_map vars) : category(category), name(name), variables(std::move(vars)) {} - + Preset(std::string_view category, std::string_view name, std::string_view condition, std::unordered_map vars) : category(category), name(name), condition(condition), variables(std::move(vars)) {} }; @@ -136,19 +137,19 @@ class GraphicPack2 bool SetActivePreset(std::string_view category, std::string_view name, bool update_visibility = true); bool SetActivePreset(std::string_view name); void UpdatePresetVisibility(); - + void AddConstantsForCurrentPreset(ExpressionParser& ep); bool ResolvePresetConstant(const std::string& varname, double& value) const; [[nodiscard]] const std::vector& GetPresets() const { return m_presets; } [[nodiscard]] std::unordered_map> GetCategorizedPresets(std::vector& order) const; - + // shaders void LoadShaders(); bool HasShaders() const; const std::vector& GetCustomShaders() const { return m_custom_shaders; } - static const std::string* FindCustomShaderSource(uint64 shaderBaseHash, uint64 shaderAuxHash, GP_SHADER_TYPE type, bool isVulkanRenderer); + static const std::string* FindCustomShaderSource(uint64 shaderBaseHash, uint64 shaderAuxHash, GP_SHADER_TYPE type, bool isVulkanRenderer, bool isMetalRenderer); const std::string& GetOutputShaderSource() const { return m_output_shader_source; } const std::string& GetDownscalingShaderSource() const { return m_downscaling_shader_source; } @@ -194,7 +195,7 @@ class GraphicPack2 { for (auto& var : preset->variables) 
parser.AddConstant(var.first, (TType)var.second.second); - } + } } for(const auto& preset : active_presets) { @@ -202,7 +203,7 @@ class GraphicPack2 { for (auto& var : preset->variables) parser.TryAddConstant(var.first, (TType)var.second.second); - } + } } for (auto& var : m_preset_vars) @@ -228,7 +229,7 @@ class GraphicPack2 bool m_activated = false; // set if the graphic pack is currently used by the running game std::vector m_title_ids; bool m_patchedFilesLoaded = false; // set to true once patched files are loaded - + sint32 m_vsync_frequency = -1; sint32 m_fs_priority = 100; @@ -241,12 +242,12 @@ class GraphicPack2 std::vector m_presets; // default preset vars std::unordered_map m_preset_vars; - + std::vector m_custom_shaders; std::vector m_texture_rules; std::string m_output_shader_source, m_upscaling_shader_source, m_downscaling_shader_source; std::unique_ptr m_output_shader, m_upscaling_shader, m_downscaling_shader, m_output_shader_ud, m_upscaling_shader_ud, m_downscaling_shader_ud; - + template bool ParseRule(const ExpressionParser& parser, IniParser& iniParser, const char* option_name, T* value_out) const; @@ -257,7 +258,7 @@ class GraphicPack2 std::vector ParseTitleIds(IniParser& rules, const char* option_name) const; - CustomShader LoadShader(const fs::path& path, uint64 shader_base_hash, uint64 shader_aux_hash, GP_SHADER_TYPE shader_type) const; + CustomShader LoadShader(const fs::path& path, uint64 shader_base_hash, uint64 shader_aux_hash, GP_SHADER_TYPE shader_type, bool isMetalShader) const; void ApplyShaderPresets(std::string& shader_source) const; void LoadReplacedFiles(); void _iterateReplacedFiles(const fs::path& currentPath, bool isAOC); @@ -330,6 +331,6 @@ std::vector GraphicPack2::ParseList(const ExpressionParser& parser, IniParser } catch (const std::invalid_argument&) {} } - + return result; -} \ No newline at end of file +} diff --git a/src/Cafe/HW/Latte/Core/FetchShader.cpp b/src/Cafe/HW/Latte/Core/FetchShader.cpp index 
6c9893f92..d50447b31 100644 --- a/src/Cafe/HW/Latte/Core/FetchShader.cpp +++ b/src/Cafe/HW/Latte/Core/FetchShader.cpp @@ -8,8 +8,12 @@ #include "Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerInstructions.h" #include "Cafe/HW/Latte/Core/FetchShader.h" #include "Cafe/HW/Latte/ISA/LatteInstructions.h" +#include "HW/Latte/Renderer/Renderer.h" #include "util/containers/LookupTableL3.h" #include "util/helpers/fspinlock.h" +#if ENABLE_METAL +#include "Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h" +#endif #include /* SHA1_DIGEST_LENGTH */ #include /* EVP_Digest */ @@ -71,7 +75,7 @@ uint32 LatteShaderRecompiler_getAttributeAlignment(LatteParsedFetchShaderAttribu return 4; } -void LatteShader_calculateFSKey(LatteFetchShader* fetchShader) +void LatteShader_calculateFSKey(LatteFetchShader* fetchShader, uint32* contextRegister) { uint64 key = 0; for (sint32 g = 0; g < fetchShader->bufferGroups.size(); g++) @@ -104,11 +108,25 @@ void LatteShader_calculateFSKey(LatteFetchShader* fetchShader) key = std::rotl(key, 8); key += (uint64)attrib->semanticId; key = std::rotl(key, 8); - key += (uint64)(attrib->offset & 3); - key = std::rotl(key, 2); + if (g_renderer->GetType() == RendererAPI::Metal) + key += (uint64)attrib->offset; + else + key += (uint64)(attrib->offset & 3); + key = std::rotl(key, 7); } } // todo - also hash invalid buffer groups? 
+ + if (g_renderer->GetType() == RendererAPI::Metal) + { + for (sint32 g = 0; g < fetchShader->bufferGroups.size(); g++) + { + LatteParsedFetchShaderBufferGroup_t& group = fetchShader->bufferGroups[g]; + key += (uint64)group.attributeBufferIndex; + key = std::rotl(key, 5); + } + } + fetchShader->key = key; } @@ -146,6 +164,29 @@ void LatteFetchShader::CalculateFetchShaderVkHash() this->vkPipelineHashFragment = h; } +void LatteFetchShader::CheckIfVerticesNeedManualFetchMtl(uint32* contextRegister) +{ +#if ENABLE_METAL + for (sint32 g = 0; g < bufferGroups.size(); g++) + { + LatteParsedFetchShaderBufferGroup_t& group = bufferGroups[g]; + uint32 bufferIndex = group.attributeBufferIndex; + uint32 bufferBaseRegisterIndex = mmSQ_VTX_ATTRIBUTE_BLOCK_START + bufferIndex * 7; + uint32 bufferStride = (contextRegister[bufferBaseRegisterIndex + 2] >> 11) & 0xFFFF; + + if (bufferStride % 4 != 0) + mtlFetchVertexManually = true; + + for (sint32 f = 0; f < group.attribCount; f++) + { + auto& attr = group.attrib[f]; + if (attr.offset + GetMtlVertexFormatSize(attr.format) > bufferStride) + mtlFetchVertexManually = true; + } + } +#endif +} + void _fetchShaderDecompiler_parseInstruction_VTX_SEMANTIC(LatteFetchShader* parsedFetchShader, uint32* contextRegister, const LatteClauseInstruction_VTX* instr) { uint32 semanticId = instr->getFieldSEM_SEMANTIC_ID(); // location (attribute index inside shader) @@ -161,7 +202,7 @@ void _fetchShaderDecompiler_parseInstruction_VTX_SEMANTIC(LatteFetchShader* pars auto nfa = instr->getField_NUM_FORMAT_ALL(); bool isSigned = instr->getField_FORMAT_COMP_ALL() == LatteClauseInstruction_VTX::FORMAT_COMP::COMP_SIGNED; auto endianSwap = instr->getField_ENDIAN_SWAP(); - + // get buffer cemu_assert_debug(bufferId >= 0xA0 && bufferId < 0xB0); uint32 bufferIndex = (bufferId - 0xA0); @@ -316,7 +357,7 @@ LatteFetchShader* LatteShaderRecompiler_createFetchShader(LatteFetchShader::Cach // {0x00000002, 0x01800c00, 0x00000000, 0x8a000000, 0x2c00a001, 0x2c151000, 
0x000a0000, ...} // size 0x50 // {0x00000002, 0x01801000, 0x00000000, 0x8a000000, 0x1c00a001, 0x280d1000, 0x00090000, ...} // size 0x60 // {0x00000002, 0x01801c00, 0x00000000, 0x8a000000, 0x1c00a001, 0x280d1000, 0x00090000, ...} // size 0x90 - + // our new implementation: // {0x00000002, 0x01800400, 0x00000000, 0x8a000000, 0x0000a001, 0x2c151000, 0x00020000, ...} @@ -326,8 +367,9 @@ LatteFetchShader* LatteShaderRecompiler_createFetchShader(LatteFetchShader::Cach { // empty fetch shader, seen in Minecraft // these only make sense when vertex shader does not call FS? - LatteShader_calculateFSKey(newFetchShader); + LatteShader_calculateFSKey(newFetchShader, contextRegister); newFetchShader->CalculateFetchShaderVkHash(); + newFetchShader->CheckIfVerticesNeedManualFetchMtl(contextRegister); return newFetchShader; } @@ -385,8 +427,9 @@ LatteFetchShader* LatteShaderRecompiler_createFetchShader(LatteFetchShader::Cach } bufferGroup.vboStride = vboOffset; } - LatteShader_calculateFSKey(newFetchShader); + LatteShader_calculateFSKey(newFetchShader, contextRegister); newFetchShader->CalculateFetchShaderVkHash(); + newFetchShader->CheckIfVerticesNeedManualFetchMtl(contextRegister); // register in cache // its possible that during multi-threaded shader cache loading, two identical (same hash) fetch shaders get created simultaneously @@ -411,7 +454,7 @@ LatteFetchShader::~LatteFetchShader() UnregisterInCache(); } -struct FetchShaderLookupInfo +struct FetchShaderLookupInfo { LatteFetchShader* fetchShader; uint32 programSize; diff --git a/src/Cafe/HW/Latte/Core/FetchShader.h b/src/Cafe/HW/Latte/Core/FetchShader.h index ac57714d0..1e580f430 100644 --- a/src/Cafe/HW/Latte/Core/FetchShader.h +++ b/src/Cafe/HW/Latte/Core/FetchShader.h @@ -46,13 +46,17 @@ struct LatteFetchShader // Vulkan uint64 vkPipelineHashFragment{}; // hash of all fetch shader state that influences the Vulkan graphics pipeline + // Metal + bool mtlFetchVertexManually{}; + // cache info CacheHash m_cacheHash{}; bool 
m_isRegistered{}; // if true, fetch shader is referenced by cache (RegisterInCache() succeeded) - void CalculateFetchShaderVkHash(); + void CheckIfVerticesNeedManualFetchMtl(uint32* contextRegister); + uint64 getVkPipelineHashFragment() const { return vkPipelineHashFragment; }; static bool isValidBufferIndex(const uint32 index) { return index < 0x10; }; @@ -69,4 +73,4 @@ struct LatteFetchShader static std::unordered_map s_fetchShaderByHash; }; -LatteFetchShader* LatteShaderRecompiler_createFetchShader(LatteFetchShader::CacheHash fsHash, uint32* contextRegister, uint32* fsProgramCode, uint32 fsProgramSize); \ No newline at end of file +LatteFetchShader* LatteShaderRecompiler_createFetchShader(LatteFetchShader::CacheHash fsHash, uint32* contextRegister, uint32* fsProgramCode, uint32 fsProgramSize); diff --git a/src/Cafe/HW/Latte/Core/LatteBufferCache.cpp b/src/Cafe/HW/Latte/Core/LatteBufferCache.cpp index 716312a39..821651ddf 100644 --- a/src/Cafe/HW/Latte/Core/LatteBufferCache.cpp +++ b/src/Cafe/HW/Latte/Core/LatteBufferCache.cpp @@ -441,7 +441,7 @@ class BufferCacheNode if (uploadBegin >= uploadEnd) return; // reserve range not within invalidation or range is zero sized - + if (uploadBegin == m_invalidationRangeBegin) { m_invalidationRangeBegin = uploadEnd; @@ -536,7 +536,7 @@ class BufferCacheNode MPTR m_invalidationRangeBegin; MPTR m_invalidationRangeEnd; - BufferCacheNode(MPTR rangeBegin, MPTR rangeEnd): m_rangeBegin(rangeBegin), m_rangeEnd(rangeEnd) + BufferCacheNode(MPTR rangeBegin, MPTR rangeEnd): m_rangeBegin(rangeBegin), m_rangeEnd(rangeEnd) { flagInUse(); cemu_assert_debug(rangeBegin < rangeEnd); @@ -740,7 +740,7 @@ class BufferCacheNode cemu_assert_debug(rangeEnd <= pageRangeEnd); cemu_assert_debug((rangeBegin & 0xF) == 0); cemu_assert_debug((rangeEnd & 0xF) == 0); - + auto pageInfo = m_pageInfo.data() + pageIndex; pageInfo->hasStreamoutData = true; @@ -805,7 +805,7 @@ class BufferCacheNode s_allCacheNodes.clear(); g_deallocateQueue.clear(); } - + static 
void ProcessDeallocations() { for(auto& itr : g_deallocateQueue) diff --git a/src/Cafe/HW/Latte/Core/LatteBufferData.cpp b/src/Cafe/HW/Latte/Core/LatteBufferData.cpp index 85d4cdf7a..7620e6a77 100644 --- a/src/Cafe/HW/Latte/Core/LatteBufferData.cpp +++ b/src/Cafe/HW/Latte/Core/LatteBufferData.cpp @@ -62,7 +62,7 @@ void rectGenerate4thVertex(uint32be* output, uint32be* input0, uint32be* input1, // order of rectangle vertices is // v0 v1 - // v2 v3 + // v2 v3 for (sint32 f = 0; f < vectorLen*4; f++) output[f] = _swapEndianU32(output[f]); @@ -199,11 +199,14 @@ bool LatteBufferCache_Sync(uint32 minIndex, uint32 maxIndex, uint32 baseInstance #if BOOST_OS_MACOS if(bufferStride % 4 != 0) { - if (VulkanRenderer* vkRenderer = VulkanRenderer::GetInstance()) + if (g_renderer->GetType() == RendererAPI::Vulkan) { - auto fixedBuffer = vkRenderer->buffer_genStrideWorkaroundVertexBuffer(bufferAddress, fixedBufferSize, bufferStride); - vkRenderer->buffer_bindVertexStrideWorkaroundBuffer(fixedBuffer.first, fixedBuffer.second, bufferIndex, fixedBufferSize); - continue; + if (VulkanRenderer* vkRenderer = VulkanRenderer::GetInstance()) + { + auto fixedBuffer = vkRenderer->buffer_genStrideWorkaroundVertexBuffer(bufferAddress, fixedBufferSize, bufferStride); + vkRenderer->buffer_bindVertexStrideWorkaroundBuffer(fixedBuffer.first, fixedBuffer.second, bufferIndex, fixedBufferSize); + continue; + } } } #endif @@ -222,4 +225,4 @@ bool LatteBufferCache_Sync(uint32 minIndex, uint32 maxIndex, uint32 baseInstance if (pixelShader) LatteBufferCache_syncGPUUniformBuffers(pixelShader, mmSQ_PS_UNIFORM_BLOCK_START, LatteConst::ShaderType::Pixel); return true; -} \ No newline at end of file +} diff --git a/src/Cafe/HW/Latte/Core/LatteIndices.cpp b/src/Cafe/HW/Latte/Core/LatteIndices.cpp index aec51725f..d5eaaecec 100644 --- a/src/Cafe/HW/Latte/Core/LatteIndices.cpp +++ b/src/Cafe/HW/Latte/Core/LatteIndices.cpp @@ -8,7 +8,7 @@ #include #endif -struct +struct { struct CacheEntry { @@ -113,6 +113,21 @@ 
uint32 LatteIndices_calculateIndexOutputSize(LattePrimitiveMode primitiveMode, L cemu_assert_suspicious(); return 0; } + else if (primitiveMode == LattePrimitiveMode::TRIANGLE_FAN && g_renderer->GetType() == RendererAPI::Metal) + { + if (indexType == LatteIndexType::AUTO) + { + if (count <= 0xFFFF) + return count * sizeof(uint16); + return count * sizeof(uint32); + } + if (indexType == LatteIndexType::U16_BE || indexType == LatteIndexType::U16_LE) + return count * sizeof(uint16); + if (indexType == LatteIndexType::U32_BE || indexType == LatteIndexType::U32_LE) + return count * sizeof(uint32); + cemu_assert_suspicious(); + return 0; + } else if(indexType == LatteIndexType::AUTO) return 0; else if (indexType == LatteIndexType::U16_BE || indexType == LatteIndexType::U16_LE) @@ -306,6 +321,44 @@ void LatteIndices_generateAutoLineLoopIndices(void* indexDataOutput, uint32 coun indexMax = std::max(count, 1u) - 1; } +template +void LatteIndices_unpackTriangleFanAndConvert(const void* indexDataInput, void* indexDataOutput, uint32 count, uint32& indexMin, uint32& indexMax) +{ + const betype* src = (betype*)indexDataInput; + T* dst = (T*)indexDataOutput; + // TODO: check this + for (sint32 i = 0; i < count; i++) + { + uint32 i0; + if (i % 2 == 0) + i0 = i / 2; + else + i0 = count - 1 - i / 2; + T idx = src[i0]; + indexMin = std::min(indexMin, (uint32)idx); + indexMax = std::max(indexMax, (uint32)idx); + dst[i] = idx; + } +} + +template +void LatteIndices_generateAutoTriangleFanIndices(const void* indexDataInput, void* indexDataOutput, uint32 count, uint32& indexMin, uint32& indexMax) +{ + const betype* src = (betype*)indexDataInput; + T* dst = (T*)indexDataOutput; + for (sint32 i = 0; i < count; i++) + { + T idx = i; + if (idx % 2 == 0) + idx = idx / 2; + else + idx = count - 1 - idx / 2; + dst[i] = idx; + } + indexMin = 0; + indexMax = std::max(count, 1u) - 1; +} + #if defined(ARCH_X86_64) ATTRIBUTE_AVX2 void LatteIndices_fastConvertU16_AVX2(const void* indexDataInput, void* 
indexDataOutput, uint32 count, uint32& indexMin, uint32& indexMax) @@ -317,7 +370,7 @@ void LatteIndices_fastConvertU16_AVX2(const void* indexDataInput, void* indexDat sint32 countRemaining = count & 15; if (count16) { - __m256i mMin = _mm256_set_epi16((sint16)0xFFFF, (sint16)0xFFFF, (sint16)0xFFFF, (sint16)0xFFFF, (sint16)0xFFFF, (sint16)0xFFFF, (sint16)0xFFFF, (sint16)0xFFFF, + __m256i mMin = _mm256_set_epi16((sint16)0xFFFF, (sint16)0xFFFF, (sint16)0xFFFF, (sint16)0xFFFF, (sint16)0xFFFF, (sint16)0xFFFF, (sint16)0xFFFF, (sint16)0xFFFF, (sint16)0xFFFF, (sint16)0xFFFF, (sint16)0xFFFF, (sint16)0xFFFF, (sint16)0xFFFF, (sint16)0xFFFF, (sint16)0xFFFF, (sint16)0xFFFF); __m256i mMax = _mm256_set_epi16(0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000); __m256i mShuffle16Swap = _mm256_set_epi8(30, 31, 28, 29, 26, 27, 24, 25, 22, 23, 20, 21, 18, 19, 16, 17, 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1); @@ -684,6 +737,29 @@ void LatteIndices_decode(const void* indexData, LatteIndexType indexType, uint32 cemu_assert_debug(false); outputCount = count + 1; } + else if (primitiveMode == LattePrimitiveMode::TRIANGLE_FAN && g_renderer->GetType() == RendererAPI::Metal) + { + if (indexType == LatteIndexType::AUTO) + { + if (count <= 0xFFFF) + { + LatteIndices_generateAutoTriangleFanIndices(indexData, indexOutputPtr, count, indexMin, indexMax); + renderIndexType = Renderer::INDEX_TYPE::U16; + } + else + { + LatteIndices_generateAutoTriangleFanIndices(indexData, indexOutputPtr, count, indexMin, indexMax); + renderIndexType = Renderer::INDEX_TYPE::U32; + } + } + else if (indexType == LatteIndexType::U16_BE) + LatteIndices_unpackTriangleFanAndConvert(indexData, indexOutputPtr, count, indexMin, indexMax); + else if (indexType == LatteIndexType::U32_BE) + LatteIndices_unpackTriangleFanAndConvert(indexData, indexOutputPtr, count, indexMin, indexMax); + else + cemu_assert_debug(false); + outputCount = 
count; + } else { if (indexType == LatteIndexType::U16_BE) @@ -696,7 +772,7 @@ void LatteIndices_decode(const void* indexData, LatteIndexType indexType, uint32 else LatteIndices_convertBE(indexData, indexOutputPtr, count, indexMin, indexMax); #else - LatteIndices_convertBE(indexData, indexOutputPtr, count, indexMin, indexMax); + LatteIndices_convertBE(indexData, indexOutputPtr, count, indexMin, indexMax); #endif } else if (indexType == LatteIndexType::U32_BE) @@ -707,7 +783,7 @@ void LatteIndices_decode(const void* indexData, LatteIndexType indexType, uint32 else LatteIndices_convertBE(indexData, indexOutputPtr, count, indexMin, indexMax); #else - LatteIndices_convertBE(indexData, indexOutputPtr, count, indexMin, indexMax); + LatteIndices_convertBE(indexData, indexOutputPtr, count, indexMin, indexMax); #endif } else if (indexType == LatteIndexType::U16_LE) diff --git a/src/Cafe/HW/Latte/Core/LatteRenderTarget.cpp b/src/Cafe/HW/Latte/Core/LatteRenderTarget.cpp index 2efef5bff..68264772e 100644 --- a/src/Cafe/HW/Latte/Core/LatteRenderTarget.cpp +++ b/src/Cafe/HW/Latte/Core/LatteRenderTarget.cpp @@ -449,14 +449,6 @@ bool LatteMRT::UpdateCurrentFBO() uint8 colorBufferMask = GetActiveColorBufferMask(pixelShader, LatteGPUState.contextNew); bool depthBufferMask = GetActiveDepthBufferMask(LatteGPUState.contextNew); - // if depth test is not used then detach the depth buffer - bool depthEnable = LatteGPUState.contextNew.DB_DEPTH_CONTROL.get_Z_ENABLE(); - bool stencilTestEnable = LatteGPUState.contextNew.DB_DEPTH_CONTROL.get_STENCIL_ENABLE(); - bool backStencilEnable = LatteGPUState.contextNew.DB_DEPTH_CONTROL.get_BACK_STENCIL_ENABLE(); - - if (!depthEnable && !stencilTestEnable && !backStencilEnable) - depthBufferMask = false; - bool hasResizedTexture = false; // set to true if any of the color buffers or the depth buffer reference a resized texture (via graphic pack texture rules) sLatteRenderTargetState.renderTargetIsResized = false; // real size @@ -723,8 +715,8 @@ void 
LatteRenderTarget_applyTextureColorClear(LatteTexture* texture, uint32 slic void LatteRenderTarget_applyTextureDepthClear(LatteTexture* texture, uint32 sliceIndex, uint32 mipIndex, bool hasDepthClear, bool hasStencilClear, float depthValue, uint8 stencilValue, uint64 eventCounter) { - if(texture->isDepth) - { + if(texture->isDepth) + { g_renderer->texture_clearDepthSlice(texture, sliceIndex, mipIndex, hasDepthClear, hasStencilClear, depthValue, stencilValue); } else @@ -883,7 +875,7 @@ void LatteRenderTarget_copyToBackbuffer(LatteTextureView* textureView, bool isPa textureView->baseTexture->GetEffectiveSize(effectiveWidth, effectiveHeight, 0); _currentOutputImageWidth = effectiveWidth; _currentOutputImageHeight = effectiveHeight; - + sint32 imageX, imageY; sint32 imageWidth, imageHeight; sint32 fullscreenWidth, fullscreenHeight; @@ -1037,7 +1029,7 @@ void LatteRenderTarget_updateViewport() float vpX = LatteGPUState.contextNew.PA_CL_VPORT_XOFFSET.get_OFFSET() - LatteGPUState.contextNew.PA_CL_VPORT_XSCALE.get_SCALE(); float vpHeight = LatteGPUState.contextNew.PA_CL_VPORT_YSCALE.get_SCALE() / -0.5f; float vpY = LatteGPUState.contextNew.PA_CL_VPORT_YOFFSET.get_OFFSET() + LatteGPUState.contextNew.PA_CL_VPORT_YSCALE.get_SCALE(); - + bool halfZ = LatteGPUState.contextNew.PA_CL_CLIP_CNTL.get_DX_CLIP_SPACE_DEF(); // calculate near/far diff --git a/src/Cafe/HW/Latte/Core/LatteShader.cpp b/src/Cafe/HW/Latte/Core/LatteShader.cpp index d9f0a5ddf..e01645842 100644 --- a/src/Cafe/HW/Latte/Core/LatteShader.cpp +++ b/src/Cafe/HW/Latte/Core/LatteShader.cpp @@ -9,10 +9,14 @@ #include "Cafe/HW/Latte/Renderer/Vulkan/VulkanRenderer.h" #include "Cafe/OS/libs/gx2/GX2.h" // todo - remove dependency #include "Cafe/GraphicPack/GraphicPack2.h" +#include "HW/Latte/Renderer/Renderer.h" #include "util/helpers/StringParser.h" #include "config/ActiveSettings.h" #include "Cafe/GameProfile/GameProfile.h" #include "util/containers/flat_hash_map.hpp" +#if ENABLE_METAL +#include 
"Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h" +#endif #include // experimental new decompiler (WIP) @@ -77,7 +81,7 @@ inline ska::flat_hash_map& LatteSHRC_GetCacheByT if (shaderType == LatteConst::ShaderType::Vertex) return sVertexShaders; else if (shaderType == LatteConst::ShaderType::Geometry) - return sGeometryShaders; + return sGeometryShaders; cemu_assert_debug(shaderType == LatteConst::ShaderType::Pixel); return sPixelShaders; } @@ -205,11 +209,9 @@ void LatteShader_free(LatteDecompilerShader* shader) delete shader; } -// both vertex and geometry/pixel shader depend on PS inputs -// we prepare the PS import info in advance -void LatteShader_UpdatePSInputs(uint32* contextRegisters) +void LatteShader_CreatePSInputTable(LatteShaderPSInputTable* psInputTable, uint32* contextRegisters) { - // PS control + // PS control uint32 psControl0 = contextRegisters[mmSPI_PS_IN_CONTROL_0]; uint32 spi0_positionEnable = (psControl0 >> 8) & 1; uint32 spi0_positionCentroid = (psControl0 >> 9) & 1; @@ -238,12 +240,12 @@ void LatteShader_UpdatePSInputs(uint32* contextRegisters) { key += std::rotr(spi0_paramGen, 7); key += std::rotr(spi0_paramGenAddr, 3); - _activePSImportTable.paramGen = spi0_paramGen; - _activePSImportTable.paramGenGPR = spi0_paramGenAddr; + psInputTable->paramGen = spi0_paramGen; + psInputTable->paramGenGPR = spi0_paramGenAddr; } else { - _activePSImportTable.paramGen = 0; + psInputTable->paramGen = 0; } // semantic imports from vertex shader @@ -277,9 +279,9 @@ void LatteShader_UpdatePSInputs(uint32* contextRegisters) key = std::rotl(key, 7); if (spi0_positionEnable && f == spi0_positionAddr) { - _activePSImportTable.import[f].semanticId = LATTE_ANALYZER_IMPORT_INDEX_SPIPOSITION; - _activePSImportTable.import[f].isFlat = false; - _activePSImportTable.import[f].isNoPerspective = false; + psInputTable->import[f].semanticId = LATTE_ANALYZER_IMPORT_INDEX_SPIPOSITION; + psInputTable->import[f].isFlat = false; + psInputTable->import[f].isNoPerspective = false; key += 
(uint64)0x33; } else @@ -292,13 +294,20 @@ void LatteShader_UpdatePSInputs(uint32* contextRegisters) semanticMask[psSemanticId >> 3] |= (1 << (psSemanticId & 7)); #endif - _activePSImportTable.import[f].semanticId = psSemanticId; - _activePSImportTable.import[f].isFlat = (psInputControl&(1 << 10)) != 0; - _activePSImportTable.import[f].isNoPerspective = (psInputControl&(1 << 12)) != 0; + psInputTable->import[f].semanticId = psSemanticId; + psInputTable->import[f].isFlat = (psInputControl&(1 << 10)) != 0; + psInputTable->import[f].isNoPerspective = (psInputControl&(1 << 12)) != 0; } } - _activePSImportTable.key = key; - _activePSImportTable.count = numPSInputs; + psInputTable->key = key; + psInputTable->count = numPSInputs; +} + +// both vertex and geometry/pixel shader depend on PS inputs +// we prepare the PS import info in advance +void LatteShader_UpdatePSInputs(uint32* contextRegisters) +{ + LatteShader_CreatePSInputTable(&_activePSImportTable, contextRegisters); } void LatteShader_CreateRendererShader(LatteDecompilerShader* shader, bool compileAsync) @@ -320,7 +329,7 @@ void LatteShader_CreateRendererShader(LatteDecompilerShader* shader, bool compil { shaderType = RendererShader::ShaderType::kGeometry; gpShaderType = GraphicPack2::GP_SHADER_TYPE::GEOMETRY; - } + } else if (shader->shaderType == LatteConst::ShaderType::Pixel) { shaderType = RendererShader::ShaderType::kFragment; @@ -330,7 +339,7 @@ void LatteShader_CreateRendererShader(LatteDecompilerShader* shader, bool compil // check if a custom shader is present std::string shaderSrc; - const std::string* customShaderSrc = GraphicPack2::FindCustomShaderSource(shader->baseHash, shader->auxHash, gpShaderType, g_renderer->GetType() == RendererAPI::Vulkan); + const std::string* customShaderSrc = GraphicPack2::FindCustomShaderSource(shader->baseHash, shader->auxHash, gpShaderType, g_renderer->GetType() == RendererAPI::Vulkan, g_renderer->GetType() == RendererAPI::Metal); if (customShaderSrc) { 
shaderSrc.assign(*customShaderSrc); @@ -443,7 +452,7 @@ void LatteShader_DumpShader(uint64 baseHash, uint64 auxHash, LatteDecompilerShad { if (!ActiveSettings::DumpShadersEnabled()) return; - + const char* suffix = ""; if (shader->shaderType == LatteConst::ShaderType::Vertex) suffix = "vs"; @@ -500,6 +509,7 @@ void LatteSHRC_UpdateVSBaseHash(uint8* vertexShaderPtr, uint32 vertexShaderSize, vsHash += tmp; auto primitiveType = LatteGPUState.contextNew.VGT_PRIMITIVE_TYPE.get_PRIMITIVE_MODE(); + // TODO: include always in the hash in case of geometry shader or rect shader if (primitiveType == Latte::LATTE_VGT_PRIMITIVE_TYPE::E_PRIMITIVE_TYPE::RECTS) { vsHash += 13ULL; @@ -514,6 +524,48 @@ void LatteSHRC_UpdateVSBaseHash(uint8* vertexShaderPtr, uint32 vertexShaderSize, if (LatteGPUState.contextNew.PA_CL_CLIP_CNTL.get_DX_CLIP_SPACE_DEF()) vsHash += 0x1537; +#if ENABLE_METAL + if (g_renderer->GetType() == RendererAPI::Metal) + { + if (usesGeometryShader || _activeFetchShader->mtlFetchVertexManually) + { + for (sint32 g = 0; g < _activeFetchShader->bufferGroups.size(); g++) + { + LatteParsedFetchShaderBufferGroup_t& group = _activeFetchShader->bufferGroups[g]; + uint32 bufferIndex = group.attributeBufferIndex; + uint32 bufferBaseRegisterIndex = mmSQ_VTX_ATTRIBUTE_BLOCK_START + bufferIndex * 7; + uint32 bufferStride = (LatteGPUState.contextRegister[bufferBaseRegisterIndex + 2] >> 11) & 0xFFFF; + + vsHash += (uint64)bufferStride; + vsHash = std::rotl(vsHash, 7); + } + } + + if (!usesGeometryShader) + { + // Rasterization + bool rasterizationEnabled = !LatteGPUState.contextNew.PA_CL_CLIP_CNTL.get_DX_RASTERIZATION_KILL(); + + // HACK + if (!LatteGPUState.contextNew.PA_CL_VTE_CNTL.get_VPORT_X_OFFSET_ENA()) + rasterizationEnabled = true; + + const auto& polygonControlReg = LatteGPUState.contextNew.PA_SU_SC_MODE_CNTL; + uint32 cullFront = polygonControlReg.get_CULL_FRONT(); + uint32 cullBack = polygonControlReg.get_CULL_BACK(); + if (cullFront && cullBack) + rasterizationEnabled 
= false; + + if (rasterizationEnabled) + vsHash += 51ULL; + + // Vertex fetch + if (_activeFetchShader->mtlFetchVertexManually) + vsHash += 349ULL; + } + } +#endif + _shaderBaseHash_vs = vsHash; } @@ -539,6 +591,7 @@ void LatteSHRC_UpdatePSBaseHash(uint8* pixelShaderPtr, uint32 pixelShaderSize, b _calculateShaderProgramHash(psProgramCode, pixelShaderSize, &hashCachePS, &psHash1, &psHash2); // get vertex shader uint64 psHash = psHash1 + psHash2 + _activePSImportTable.key + (usesGeometryShader ? hashCacheGS.prevHash1 : 0ULL); + _shaderBaseHash_ps = psHash; } @@ -572,6 +625,7 @@ uint64 LatteSHRC_CalcVSAuxHash(LatteDecompilerShader* vertexShader, uint32* cont auxHashTex += 0x333; } } + return auxHash + auxHashTex; } @@ -605,6 +659,35 @@ uint64 LatteSHRC_CalcPSAuxHash(LatteDecompilerShader* pixelShader, uint32* conte auxHash = (auxHash << 3) | (auxHash >> 61); auxHash += (uint64)dim; } + + // Textures as render targets + for (uint32 i = 0; i < pixelShader->textureUnitListCount; i++) + { + uint8 t = pixelShader->textureUnitList[i]; + auxHash = std::rotl(auxHash, 11); + auxHash += (uint64)pixelShader->textureRenderTargetIndex[t]; + } + +#if ENABLE_METAL + if (g_renderer->GetType() == RendererAPI::Metal) + { + for (uint8 i = 0; i < LATTE_NUM_COLOR_TARGET; i++) + { + auto format = LatteMRT::GetColorBufferFormat(i, LatteGPUState.contextNew); + uint8 dataType = (uint8)GetMtlPixelFormatInfo(format, false).dataType; + auxHash = std::rotl(auxHash, 7); + auxHash += (uint64)dataType; + } + + bool hasDepthBuffer = LatteMRT::GetActiveDepthBufferMask(LatteGPUState.contextNew); + if (hasDepthBuffer) + { + auxHash = std::rotl(auxHash, 5); + auxHash += 13u; + } + } +#endif + return auxHash; } @@ -613,10 +696,13 @@ LatteDecompilerShader* LatteShader_CreateShaderFromDecompilerOutput(LatteDecompi LatteDecompilerShader* shader = decompilerOutput.shader; shader->baseHash = baseHash; // copy resource mapping - if(g_renderer->GetType() == RendererAPI::Vulkan) + // HACK + if 
(g_renderer->GetType() == RendererAPI::Vulkan) shader->resourceMapping = decompilerOutput.resourceMappingVK; - else + else if (g_renderer->GetType() == RendererAPI::OpenGL) shader->resourceMapping = decompilerOutput.resourceMappingGL; + else + shader->resourceMapping = decompilerOutput.resourceMappingMTL; // copy texture info shader->textureUnitMask2 = decompilerOutput.textureUnitMask; // copy streamout info @@ -624,7 +710,8 @@ LatteDecompilerShader* LatteShader_CreateShaderFromDecompilerOutput(LatteDecompi shader->hasStreamoutBufferWrite = decompilerOutput.streamoutBufferWriteMask.any(); // copy uniform offsets // for OpenGL these are retrieved in _prepareSeparableUniforms() - if (g_renderer->GetType() == RendererAPI::Vulkan) + // HACK + if (g_renderer->GetType() != RendererAPI::OpenGL) { shader->uniform.loc_remapped = decompilerOutput.uniformOffsetsVK.offset_remapped; shader->uniform.loc_uniformRegister = decompilerOutput.uniformOffsetsVK.offset_uniformRegister; @@ -684,9 +771,9 @@ void LatteShader_GetDecompilerOptions(LatteDecompilerOptions& options, LatteCons { options.usesGeometryShader = geometryShaderEnabled; options.spirvInstrinsics.hasRoundingModeRTEFloat32 = false; + options.useTFViaSSBO = g_renderer->UseTFViaSSBO(); if (g_renderer->GetType() == RendererAPI::Vulkan) { - options.useTFViaSSBO = VulkanRenderer::GetInstance()->UseTFViaSSBO(); options.spirvInstrinsics.hasRoundingModeRTEFloat32 = VulkanRenderer::GetInstance()->HasSPRIVRoundingModeRTE32(); } options.strictMul = g_current_game_profile->GetAccurateShaderMul() != AccurateShaderMulOption::False; @@ -1009,4 +1096,4 @@ void LatteSHRC_UnloadAll() while(!sPixelShaders.empty()) LatteShader_free(sPixelShaders.begin()->second); cemu_assert_debug(sPixelShaders.empty()); -} \ No newline at end of file +} diff --git a/src/Cafe/HW/Latte/Core/LatteShader.h b/src/Cafe/HW/Latte/Core/LatteShader.h index f8dc6d1a3..85d53b01b 100644 --- a/src/Cafe/HW/Latte/Core/LatteShader.h +++ 
b/src/Cafe/HW/Latte/Core/LatteShader.h @@ -84,6 +84,7 @@ struct LatteShaderPSInputTable } }; +void LatteShader_CreatePSInputTable(LatteShaderPSInputTable* psInputTable, uint32* contextRegisters); void LatteShader_UpdatePSInputs(uint32* contextRegisters); LatteShaderPSInputTable* LatteSHRC_GetPSInputTable(); @@ -126,4 +127,4 @@ void LatteShaderCache_writeSeparableGeometryShader(uint64 shaderBaseHash, uint64 void LatteShaderCache_writeSeparablePixelShader(uint64 shaderBaseHash, uint64 shaderAuxHash, uint8* pixelShader, uint32 pixelShaderSize, uint32* contextRegisters, bool usesGeometryShader); // todo - refactor this -sint32 LatteDecompiler_getTextureSamplerBaseIndex(LatteConst::ShaderType shaderType); \ No newline at end of file +sint32 LatteDecompiler_getTextureSamplerBaseIndex(LatteConst::ShaderType shaderType); diff --git a/src/Cafe/HW/Latte/Core/LatteShaderCache.cpp b/src/Cafe/HW/Latte/Core/LatteShaderCache.cpp index 9b24de453..27bbd0173 100644 --- a/src/Cafe/HW/Latte/Core/LatteShaderCache.cpp +++ b/src/Cafe/HW/Latte/Core/LatteShaderCache.cpp @@ -11,6 +11,10 @@ #include "Cafe/HW/Latte/Renderer/Renderer.h" #include "Cafe/HW/Latte/Renderer/OpenGL/RendererShaderGL.h" #include "Cafe/HW/Latte/Renderer/Vulkan/RendererShaderVk.h" +#if ENABLE_METAL +#include "Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.h" +#include "Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.h" +#endif #include "Cafe/HW/Latte/Renderer/Vulkan/VulkanPipelineStableCache.h" #include @@ -44,7 +48,7 @@ struct sint32 pixelShaderCount; }shaderCacheScreenStats; -struct +struct { ImTextureID textureTVId; ImTextureID textureDRCId; @@ -65,7 +69,7 @@ FileCache* s_shaderCacheGeneric = nullptr; // contains hardware and version inde #define SHADER_CACHE_TYPE_PIXEL (2) bool LatteShaderCache_readSeparableShader(uint8* shaderInfoData, sint32 shaderInfoSize); -void LatteShaderCache_LoadVulkanPipelineCache(uint64 cacheTitleId); +void LatteShaderCache_LoadPipelineCache(uint64 cacheTitleId); bool 
LatteShaderCache_updatePipelineLoadingProgress(); void LatteShaderCache_ShowProgress(const std::function & loadUpdateFunc, bool isPipelines); @@ -272,10 +276,14 @@ static BootSoundPlayer g_bootSndPlayer; void LatteShaderCache_finish() { - if (g_renderer->GetType() == RendererAPI::Vulkan) + if (g_renderer->GetType() == RendererAPI::Vulkan) RendererShaderVk::ShaderCacheLoading_end(); else if (g_renderer->GetType() == RendererAPI::OpenGL) RendererShaderGL::ShaderCacheLoading_end(); +#if ENABLE_METAL + else if (g_renderer->GetType() == RendererAPI::Metal) + RendererShaderMtl::ShaderCacheLoading_end(); +#endif } uint32 LatteShaderCache_getShaderCacheExtraVersion(uint64 titleId) @@ -358,8 +366,17 @@ void LatteShaderCache_Load() RendererShaderVk::ShaderCacheLoading_begin(cacheTitleId); else if (g_renderer->GetType() == RendererAPI::OpenGL) RendererShaderGL::ShaderCacheLoading_begin(cacheTitleId); +#if ENABLE_METAL + else if (g_renderer->GetType() == RendererAPI::Metal) + RendererShaderMtl::ShaderCacheLoading_begin(cacheTitleId); +#endif + // get cache file name - const auto pathGeneric = ActiveSettings::GetCachePath("shaderCache/transferable/{:016x}_shaders.bin", cacheTitleId); + fs::path pathGeneric; + if (g_renderer->GetType() == RendererAPI::Metal) + pathGeneric = ActiveSettings::GetCachePath("shaderCache/transferable/{:016x}_mtlshaders.bin", cacheTitleId); + else + pathGeneric = ActiveSettings::GetCachePath("shaderCache/transferable/{:016x}_shaders.bin", cacheTitleId); const auto pathGenericPre1_25_0 = ActiveSettings::GetCachePath("shaderCache/transferable/{:016x}.bin", cacheTitleId); // before 1.25.0 const auto pathGenericPre1_16_0 = ActiveSettings::GetCachePath("shaderCache/transferable/{:08x}.bin", CafeSystem::GetRPXHashBase()); // before 1.16.0 @@ -446,7 +463,7 @@ void LatteShaderCache_Load() }; LatteShaderCache_ShowProgress(LoadShadersUpdate, false); - + LatteShaderCache_updateCompileQueue(0); // write load time and RAM usage to log file (in dev build) #if 
BOOST_OS_WINDOWS @@ -459,9 +476,9 @@ void LatteShaderCache_Load() cemuLog_log(LogType::Force, "Shader cache loaded with {} shaders. Commited mem {}MB. Took {}ms", numLoadedShaders, (sint32)(memCommited/1024/1024), timeLoad); #endif LatteShaderCache_finish(); - // if Vulkan then also load pipeline cache - if (g_renderer->GetType() == RendererAPI::Vulkan) - LatteShaderCache_LoadVulkanPipelineCache(cacheTitleId); + // if Vulkan or Metal then also load pipeline cache + if (g_renderer->GetType() == RendererAPI::Vulkan || g_renderer->GetType() == RendererAPI::Metal) + LatteShaderCache_LoadPipelineCache(cacheTitleId); g_renderer->BeginFrame(true); @@ -491,7 +508,7 @@ void LatteShaderCache_ShowProgress(const std::function & loadUpdateF { const auto kPopupFlags = ImGuiWindowFlags_NoMove | ImGuiWindowFlags_NoDecoration | ImGuiWindowFlags_NoSavedSettings | ImGuiWindowFlags_NoFocusOnAppearing | ImGuiWindowFlags_NoNav | ImGuiWindowFlags_AlwaysAutoResize; const auto textColor = 0xFF888888; - + auto lastFrameUpdate = tick_cached(); while (true) @@ -544,7 +561,7 @@ void LatteShaderCache_ShowProgress(const std::function & loadUpdateF std::string text; if (isPipelines) { - text = "Loading cached Vulkan pipelines..."; + text = "Loading cached pipelines..."; } else { @@ -618,13 +635,22 @@ void LatteShaderCache_ShowProgress(const std::function & loadUpdateF } } -void LatteShaderCache_LoadVulkanPipelineCache(uint64 cacheTitleId) +void LatteShaderCache_LoadPipelineCache(uint64 cacheTitleId) { - auto& pipelineCache = VulkanPipelineStableCache::GetInstance(); - g_shaderCacheLoaderState.pipelineFileCount = pipelineCache.BeginLoading(cacheTitleId); + if (g_renderer->GetType() == RendererAPI::Vulkan) + g_shaderCacheLoaderState.pipelineFileCount = VulkanPipelineStableCache::GetInstance().BeginLoading(cacheTitleId); +#if ENABLE_METAL + else if (g_renderer->GetType() == RendererAPI::Metal) + g_shaderCacheLoaderState.pipelineFileCount = 
MetalPipelineCache::GetInstance().BeginLoading(cacheTitleId); +#endif g_shaderCacheLoaderState.loadedPipelines = 0; LatteShaderCache_ShowProgress(LatteShaderCache_updatePipelineLoadingProgress, true); - pipelineCache.EndLoading(); + if (g_renderer->GetType() == RendererAPI::Vulkan) + VulkanPipelineStableCache::GetInstance().EndLoading(); +#if ENABLE_METAL + else if (g_renderer->GetType() == RendererAPI::Metal) + MetalPipelineCache::GetInstance().EndLoading(); +#endif if(Latte_GetStopSignal()) LatteThread_Exit(); } @@ -632,7 +658,14 @@ void LatteShaderCache_LoadVulkanPipelineCache(uint64 cacheTitleId) bool LatteShaderCache_updatePipelineLoadingProgress() { uint32 pipelinesMissingShaders = 0; - return VulkanPipelineStableCache::GetInstance().UpdateLoading(g_shaderCacheLoaderState.loadedPipelines, pipelinesMissingShaders); + if (g_renderer->GetType() == RendererAPI::Vulkan) + return VulkanPipelineStableCache::GetInstance().UpdateLoading(g_shaderCacheLoaderState.loadedPipelines, pipelinesMissingShaders); +#if ENABLE_METAL + else if (g_renderer->GetType() == RendererAPI::Metal) + return MetalPipelineCache::GetInstance().UpdateLoading(g_shaderCacheLoaderState.loadedPipelines, pipelinesMissingShaders); +#endif + + return false; } uint64 LatteShaderCache_getShaderNameInTransferableCache(uint64 baseHash, uint32 shaderType) @@ -891,13 +924,21 @@ void LatteShaderCache_Close() s_shaderCacheGeneric = nullptr; } if (g_renderer->GetType() == RendererAPI::Vulkan) - RendererShaderVk::ShaderCacheLoading_Close(); - else if (g_renderer->GetType() == RendererAPI::OpenGL) - RendererShaderGL::ShaderCacheLoading_Close(); + RendererShaderVk::ShaderCacheLoading_Close(); + else if (g_renderer->GetType() == RendererAPI::OpenGL) + RendererShaderGL::ShaderCacheLoading_Close(); +#if ENABLE_METAL + else if (g_renderer->GetType() == RendererAPI::Metal) + RendererShaderMtl::ShaderCacheLoading_Close(); +#endif - // if Vulkan then also close pipeline cache + // if Vulkan or Metal then also close 
pipeline cache if (g_renderer->GetType() == RendererAPI::Vulkan) VulkanPipelineStableCache::GetInstance().Close(); +#if ENABLE_METAL + else if (g_renderer->GetType() == RendererAPI::Metal) + MetalPipelineCache::GetInstance().Close(); +#endif } #include @@ -913,7 +954,7 @@ void LatteShaderCache_handleDeprecatedCacheFiles(fs::path pathGeneric, fs::path { // ask user if they want to delete or keep the old cache file auto infoMsg = _("Cemu detected that the shader cache for this game is outdated.\nOnly shader caches generated with Cemu 1.25.0 or above are supported.\n\nWe recommend deleting the outdated cache file as it will no longer be used by Cemu."); - + wxMessageDialog dialog(nullptr, infoMsg, _("Outdated shader cache"), wxYES_NO | wxCENTRE | wxICON_EXCLAMATION); diff --git a/src/Cafe/HW/Latte/Core/LatteShaderGL.cpp b/src/Cafe/HW/Latte/Core/LatteShaderGL.cpp index b8cb0ce1b..09c484e68 100644 --- a/src/Cafe/HW/Latte/Core/LatteShaderGL.cpp +++ b/src/Cafe/HW/Latte/Core/LatteShaderGL.cpp @@ -26,7 +26,7 @@ bool gxShader_checkIfSuccessfullyLinked(GLuint glProgram) void LatteShader_prepareSeparableUniforms(LatteDecompilerShader* shader) { - if (g_renderer->GetType() == RendererAPI::Vulkan) + if (g_renderer->GetType() != RendererAPI::OpenGL) return; auto shaderGL = (RendererShaderGL*)shader->shader; diff --git a/src/Cafe/HW/Latte/Core/LatteTexture.cpp b/src/Cafe/HW/Latte/Core/LatteTexture.cpp index d88528910..4445fb26b 100644 --- a/src/Cafe/HW/Latte/Core/LatteTexture.cpp +++ b/src/Cafe/HW/Latte/Core/LatteTexture.cpp @@ -170,7 +170,7 @@ void LatteTexture_UnregisterTextureMemoryOccupancy(LatteTexture* texture) } // calculate the actually accessed data range -// the resulting range is an estimate and may be smaller than the actual slice size (but not larger) +// the resulting range is an estimate and may be smaller than the actual slice size (but not larger) void LatteTexture_EstimateMipSliceAccessedDataRange(LatteTexture* texture, sint32 sliceIndex, sint32 mipIndex, 
LatteTextureSliceMipInfo* sliceMipInfo) { uint32 estAddrStart; @@ -222,7 +222,7 @@ void LatteTexture_InitSliceAndMipInfo(LatteTexture* texture) LatteAddrLib::AddrSurfaceInfo_OUT surfaceInfo; LatteAddrLib::GX2CalculateSurfaceInfo(texture->format, texture->width, texture->height, texture->depth, texture->dim, Latte::MakeGX2TileMode(texture->tileMode), 0, mipIndex, &surfaceInfo); sliceMipInfo->tileMode = surfaceInfo.hwTileMode; - + if (mipIndex == 0) sliceMipInfo->pitch = texture->pitch; // for the base level, use the pitch value configured in hardware else @@ -877,7 +877,7 @@ VIEWCOMPATIBILITY LatteTexture_CanTextureBeRepresentedAsView(LatteTexture* baseT // check pitch if(sliceMipInfo->pitch != pitch) continue; - // check all slices + // check all slices if(LatteAddrLib::TM_IsThickAndMacroTiled(baseTexture->tileMode)) continue; // todo - check only every 4th slice? for (sint32 s=0; sGetMipDepth(m); s++) @@ -978,7 +978,7 @@ LatteTextureView* LatteTexture_CreateMapping(MPTR physAddr, MPTR physMipAddr, si } // note: When creating an existing texture, we only allow mip and slice expansion at the end cemu_assert_debug(depth); - + cemu_assert_debug(!(depth > 1 && dimBase == Latte::E_DIM::DIM_2D)); cemu_assert_debug(!(numSlice > 1 && dimView == Latte::E_DIM::DIM_2D)); // todo, depth and numSlice are redundant @@ -1308,6 +1308,40 @@ LatteTexture::LatteTexture(Latte::E_DIM dim, MPTR physAddress, MPTR physMipAddre { this->enableReadback = true; } + + // calculate number of potential mip levels (from effective size) + sint32 effectiveWidth = width; + sint32 effectiveHeight = height; + sint32 effectiveDepth = depth; + if (this->overwriteInfo.hasResolutionOverwrite) + { + effectiveWidth = this->overwriteInfo.width; + effectiveHeight = this->overwriteInfo.height; + effectiveDepth = this->overwriteInfo.depth; + } + this->maxPossibleMipLevels = 1; + if (dim != Latte::E_DIM::DIM_3D) + { + for (sint32 i = 0; i < 20; i++) + { + if ((effectiveWidth >> i) <= 1 && (effectiveHeight >> i) 
<= 1) + { + this->maxPossibleMipLevels = i + 1; + break; + } + } + } + else + { + for (sint32 i = 0; i < 20; i++) + { + if ((effectiveWidth >> i) <= 1 && (effectiveHeight >> i) <= 1 && (effectiveDepth >> i) <= 1) + { + this->maxPossibleMipLevels = i + 1; + break; + } + } + } } LatteTexture::~LatteTexture() diff --git a/src/Cafe/HW/Latte/Core/LatteTextureLegacy.cpp b/src/Cafe/HW/Latte/Core/LatteTextureLegacy.cpp index 50aa4d876..25c9f54b3 100644 --- a/src/Cafe/HW/Latte/Core/LatteTextureLegacy.cpp +++ b/src/Cafe/HW/Latte/Core/LatteTextureLegacy.cpp @@ -13,7 +13,7 @@ struct TexScaleXY float xy[2]; }; -struct +struct { TexScaleXY perUnit[Latte::GPU_LIMITS::NUM_TEXTURES_PER_STAGE]; // stores actualResolution/effectiveResolution ratio for each texture }LatteTextureScale[static_cast(LatteConst::ShaderType::TotalCount)] = { }; @@ -73,46 +73,16 @@ void LatteTexture_ReloadData(LatteTexture* tex) LatteTextureView* LatteTexture_CreateTexture(Latte::E_DIM dim, MPTR physAddress, MPTR physMipAddress, Latte::E_GX2SURFFMT format, uint32 width, uint32 height, uint32 depth, uint32 pitch, uint32 mipLevels, uint32 swizzle, Latte::E_HWTILEMODE tileMode, bool isDepth) { const auto tex = g_renderer->texture_createTextureEx(dim, physAddress, physMipAddress, format, width, height, depth, pitch, mipLevels, swizzle, tileMode, isDepth); + // init slice/mip info array LatteTexture_InitSliceAndMipInfo(tex); LatteTexture_RegisterTextureMemoryOccupancy(tex); cemu_assert_debug(mipLevels != 0); - // calculate number of potential mip levels (from effective size) - sint32 effectiveWidth = width; - sint32 effectiveHeight = height; - sint32 effectiveDepth = depth; - if (tex->overwriteInfo.hasResolutionOverwrite) - { - effectiveWidth = tex->overwriteInfo.width; - effectiveHeight = tex->overwriteInfo.height; - effectiveDepth = tex->overwriteInfo.depth; - } - tex->maxPossibleMipLevels = 1; - if (dim != Latte::E_DIM::DIM_3D) - { - for (sint32 i = 0; i < 20; i++) - { - if ((effectiveWidth >> i) <= 1 && 
(effectiveHeight >> i) <= 1) - { - tex->maxPossibleMipLevels = i + 1; - break; - } - } - } - else - { - for (sint32 i = 0; i < 20; i++) - { - if ((effectiveWidth >> i) <= 1 && (effectiveHeight >> i) <= 1 && (effectiveDepth >> i) <= 1) - { - tex->maxPossibleMipLevels = i + 1; - break; - } - } - } + LatteTexture_ReloadData(tex); LatteTC_MarkTextureStillInUse(tex); LatteTC_RegisterTexture(tex); + // create initial view that maps to the whole texture tex->baseView = tex->GetOrCreateView(0, tex->mipLevels, 0, tex->depth); return tex->baseView; @@ -371,4 +341,4 @@ uint64 LatteTexture_getNextUpdateEventCounter() void LatteTexture_init() { -} \ No newline at end of file +} diff --git a/src/Cafe/HW/Latte/Core/LatteTextureLoader.cpp b/src/Cafe/HW/Latte/Core/LatteTextureLoader.cpp index c06a3bf18..b80bd869c 100644 --- a/src/Cafe/HW/Latte/Core/LatteTextureLoader.cpp +++ b/src/Cafe/HW/Latte/Core/LatteTextureLoader.cpp @@ -602,7 +602,7 @@ void LatteTextureLoader_loadTextureDataIntoSlice(LatteTexture* hostTexture, sint void LatteTextureLoader_UpdateTextureSliceData(LatteTexture* tex, uint32 sliceIndex, uint32 mipIndex, MPTR physImagePtr, MPTR physMipPtr, Latte::E_DIM dim, uint32 width, uint32 height, uint32 depth, uint32 mipLevels, uint32 pitch, Latte::E_HWTILEMODE tileMode, uint32 swizzle, bool dumpTex) { LatteTextureLoaderCtx textureLoader = { 0 }; - + Latte::E_GX2SURFFMT format = tex->format; LatteTextureLoader_begin(&textureLoader, sliceIndex, mipIndex, physImagePtr, physMipPtr, format, dim, width, height, depth, mipLevels, pitch, tileMode, swizzle); @@ -853,7 +853,7 @@ void LatteTextureLoader_writeReadbackTextureToMemory(LatteTextureDefinition* tex pixelInput += 4; } } - } + } else { cemuLog_logDebug(LogType::Force, "Texture readback unsupported format {:04x} for tileMode 0x{:02x}", (uint32)textureData->format, textureData->tileMode); diff --git a/src/Cafe/HW/Latte/Core/LatteTextureLoader.h b/src/Cafe/HW/Latte/Core/LatteTextureLoader.h index f6de57d68..7b2c109b3 100644 --- 
a/src/Cafe/HW/Latte/Core/LatteTextureLoader.h +++ b/src/Cafe/HW/Latte/Core/LatteTextureLoader.h @@ -594,7 +594,7 @@ class TextureDecoder_R4_G4_UNORM_To_RGBA4 : public TextureDecoder, public Single } }; -class TextureDecoder_R4_G4_UNORM_To_RGBA4_vk : public TextureDecoder, public SingletonClass +class TextureDecoder_R4_G4_UNORM_To_ABGR4 : public TextureDecoder, public SingletonClass { public: sint32 getBytesPerTexel(LatteTextureLoaderCtx* textureLoader) override @@ -679,6 +679,51 @@ class TextureDecoder_R4G4_UNORM_To_RGBA8 : public TextureDecoder, public Singlet } }; +class TextureDecoder_R4G4_UNORM_To_RG8 : public TextureDecoder, public SingletonClass +{ +public: + sint32 getBytesPerTexel(LatteTextureLoaderCtx* textureLoader) override + { + return 2; + } + + void decode(LatteTextureLoaderCtx* textureLoader, uint8* outputData) override + { + for (sint32 y = 0; y < textureLoader->height; y += textureLoader->stepY) + { + sint32 yc = y; + for (sint32 x = 0; x < textureLoader->width; x += textureLoader->stepX) + { + uint8* blockData = LatteTextureLoader_GetInput(textureLoader, x, y); + sint32 pixelOffset = (x + yc * textureLoader->width) * 2; + uint8 v0 = (*(uint8*)(blockData + 0)); + + uint8 red4 = (v0 >> 4) & 0xF; + uint8 green4 = (v0 & 0xF); + + red4 = (red4 << 4) | red4; + green4 = (green4 << 4) | green4; + + *(uint8*)(outputData + pixelOffset + 0) = red4; + *(uint8*)(outputData + pixelOffset + 1) = green4; + } + } + } + + void decodePixelToRGBA(uint8* blockData, uint8* outputPixel, uint8 blockOffsetX, uint8 blockOffsetY) override + { + uint8 v0 = *(blockData + 0); + uint8 red4 = (v0 >> 4) & 0xF; + uint8 green4 = (v0 & 0xF); + red4 = (red4 << 4) | red4; + green4 = (green4 << 4) | green4; + *(outputPixel + 0) = red4; + *(outputPixel + 1) = green4; + *(outputPixel + 2) = 0; + *(outputPixel + 3) = 255; + } +}; + class TextureDecoder_R4_G4_B4_A4_UNORM : public TextureDecoder, public SingletonClass { public: @@ -723,7 +768,6 @@ class TextureDecoder_R4_G4_B4_A4_UNORM : 
public TextureDecoder, public Singleton } }; - class TextureDecoder_R4G4B4A4_UNORM_To_RGBA8 : public TextureDecoder, public SingletonClass { public: @@ -2121,4 +2165,4 @@ class TextureDecoder_BC5 : public TextureDecoder, public SingletonClassinstructionsTEX.emplace_back(texInstruction); } else @@ -1068,9 +1068,16 @@ void _LatteDecompiler_Process(LatteDecompilerShaderContext* shaderContext, uint8 LatteDecompiler_analyzeDataTypes(shaderContext); // emit code if (shaderContext->shader->hasError == false) - LatteDecompiler_emitGLSLShader(shaderContext, shaderContext->shader); + { + if (g_renderer->GetType() == RendererAPI::OpenGL || g_renderer->GetType() == RendererAPI::Vulkan) + LatteDecompiler_emitGLSLShader(shaderContext, shaderContext->shader); +#if ENABLE_METAL + else + LatteDecompiler_emitMSLShader(shaderContext, shaderContext->shader); +#endif + } LatteDecompiler_cleanup(shaderContext); - // fast access + // fast access _LatteDecompiler_GenerateDataForFastAccess(shaderContext->shader); } diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompiler.h b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompiler.h index 1159614e5..475bacb0c 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompiler.h +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompiler.h @@ -36,7 +36,7 @@ typedef struct uint16 mappedIndexOffset; // index in remapped uniform array }LatteFastAccessRemappedUniformEntry_buffer_t; -typedef struct +typedef struct { uint32 texUnit; sint32 uniformLocation; @@ -57,12 +57,16 @@ struct LatteDecompilerShaderResourceMapping // texture sint8 textureUnitToBindingPoint[LATTE_NUM_MAX_TEX_UNITS]; // uniform buffer - sint8 uniformVarsBufferBindingPoint{}; // special block for uniform registers/remapped array/custom variables + sint8 uniformVarsBufferBindingPoint{-1}; // special block for uniform registers/remapped array/custom variables sint8 uniformBuffersBindingPoint[LATTE_NUM_MAX_UNIFORM_BUFFERS]; // shader storage buffer for transform 
feedback (if alternative mode is used) sint8 tfStorageBindingPoint{-1}; // attributes (vertex shader only) sint8 attributeMapping[LATTE_NUM_MAX_ATTRIBUTE_LOCATIONS]; + // Metal exclusive + sint8 verticesPerInstanceBinding{-1}; + sint8 indexBufferBinding{-1}; + sint8 indexTypeBinding{-1}; sint32 getTextureCount() { @@ -179,9 +183,12 @@ struct LatteDecompilerShader std::bitset textureUnitMask2; uint16 textureUnitSamplerAssignment[LATTE_NUM_MAX_TEX_UNITS]{ 0 }; // LATTE_DECOMPILER_SAMPLER_NONE means undefined bool textureUsesDepthCompare[LATTE_NUM_MAX_TEX_UNITS]{}; + uint8 textureRenderTargetIndex[LATTE_NUM_MAX_TEX_UNITS]; // analyzer stage (pixel outputs) uint32 pixelColorOutputMask{ 0 }; // from LSB to MSB, 1 bit per written output. 1 if written (indices of color attachments) + // analyzer stage (depth output) + bool depthMask{ false }; // analyzer stage (geometry shader parameters/inputs) uint32 ringParameterCount{ 0 }; uint32 ringParameterCountFromPrevStage{ 0 }; // used in geometry shader to hold VS ringParameterCount @@ -198,7 +205,7 @@ struct LatteDecompilerShader // resource mapping (binding points) LatteDecompilerShaderResourceMapping resourceMapping{}; // uniforms - struct + struct { sint32 loc_remapped; // uf_remappedVS/uf_remappedGS/uf_remappedPS sint32 loc_uniformRegister; // uf_uniformRegisterVS/uf_uniformRegisterGS/uf_uniformRegisterPS @@ -215,7 +222,7 @@ struct LatteDecompilerShader sint32 uniformRangeSize; // entire size of uniform variable block }uniform{ 0 }; // fast access - struct _RemappedUniformBufferGroup + struct _RemappedUniformBufferGroup { _RemappedUniformBufferGroup(uint32 _kcacheBankIdOffset) : kcacheBankIdOffset(_kcacheBankIdOffset) {}; @@ -255,14 +262,14 @@ struct LatteDecompilerOutputUniformOffsets } }; -struct LatteDecompilerOptions +struct LatteDecompilerOptions { bool usesGeometryShader{ false }; // floating point math bool strictMul{}; // if true, 0*anything=0 rule is emulated // Vulkan-specific bool useTFViaSSBO{ false }; - struct 
+ struct { bool hasRoundingModeRTEFloat32{ false }; }spirvInstrinsics; @@ -286,6 +293,7 @@ struct LatteDecompilerOutput_t // mapping and binding information LatteDecompilerShaderResourceMapping resourceMappingGL; LatteDecompilerShaderResourceMapping resourceMappingVK; + LatteDecompilerShaderResourceMapping resourceMappingMTL; }; struct LatteDecompilerSubroutineInfo; @@ -322,4 +330,4 @@ struct LatteParsedGSCopyShader }; LatteParsedGSCopyShader* LatteGSCopyShaderParser_parse(uint8* programData, uint32 programSize); -bool LatteGSCopyShaderParser_getExportTypeByOffset(LatteParsedGSCopyShader* shaderContext, uint32 offset, uint32* exportType, uint32* exportParam); \ No newline at end of file +bool LatteGSCopyShaderParser_getExportTypeByOffset(LatteParsedGSCopyShader* shaderContext, uint32 offset, uint32* exportType, uint32* exportParam); diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerAnalyzer.cpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerAnalyzer.cpp index ff64988c2..ce3203166 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerAnalyzer.cpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerAnalyzer.cpp @@ -8,6 +8,11 @@ #include "Cafe/HW/Latte/Core/FetchShader.h" #include "Cafe/HW/Latte/Core/LatteShader.h" #include "Cafe/HW/Latte/Renderer/Renderer.h" +#include "Common/MemPtr.h" +#include "HW/Latte/ISA/LatteReg.h" + +// Defined in LatteTextureLegacy.cpp +Latte::E_GX2SURFFMT LatteTexture_ReconstructGX2Format(const Latte::LATTE_SQ_TEX_RESOURCE_WORD1_N& texUnitWord1, const Latte::LATTE_SQ_TEX_RESOURCE_WORD4_N& texUnitWord4); /* * Return index of used color attachment based on shader pixel export index (0-7) @@ -289,15 +294,15 @@ void LatteDecompiler_analyzeTEXClause(LatteDecompilerShaderContext* shaderContex LatteDecompilerShader* shader = shaderContext->shader; for(auto& texInstruction : cfInstruction->instructionsTEX) { - if( texInstruction.opcode == GPU7_TEX_INST_SAMPLE || - texInstruction.opcode == 
GPU7_TEX_INST_SAMPLE_L || - texInstruction.opcode == GPU7_TEX_INST_SAMPLE_LB || - texInstruction.opcode == GPU7_TEX_INST_SAMPLE_LZ || - texInstruction.opcode == GPU7_TEX_INST_SAMPLE_C || + if( texInstruction.opcode == GPU7_TEX_INST_SAMPLE || + texInstruction.opcode == GPU7_TEX_INST_SAMPLE_L || + texInstruction.opcode == GPU7_TEX_INST_SAMPLE_LB || + texInstruction.opcode == GPU7_TEX_INST_SAMPLE_LZ || + texInstruction.opcode == GPU7_TEX_INST_SAMPLE_C || texInstruction.opcode == GPU7_TEX_INST_SAMPLE_C_L || texInstruction.opcode == GPU7_TEX_INST_SAMPLE_C_LZ || - texInstruction.opcode == GPU7_TEX_INST_FETCH4 || - texInstruction.opcode == GPU7_TEX_INST_SAMPLE_G || + texInstruction.opcode == GPU7_TEX_INST_FETCH4 || + texInstruction.opcode == GPU7_TEX_INST_SAMPLE_G || texInstruction.opcode == GPU7_TEX_INST_LD ) { if (texInstruction.textureFetch.textureIndex < 0 || texInstruction.textureFetch.textureIndex >= LATTE_NUM_MAX_TEX_UNITS) @@ -315,7 +320,7 @@ void LatteDecompiler_analyzeTEXClause(LatteDecompilerShaderContext* shaderContex shader->textureUnitSamplerAssignment[texInstruction.textureFetch.textureIndex] = texInstruction.textureFetch.samplerIndex; if( texInstruction.opcode == GPU7_TEX_INST_SAMPLE_C || texInstruction.opcode == GPU7_TEX_INST_SAMPLE_C_L || texInstruction.opcode == GPU7_TEX_INST_SAMPLE_C_LZ) shader->textureUsesDepthCompare[texInstruction.textureFetch.textureIndex] = true; - + bool useTexelCoords = false; if (texInstruction.opcode == GPU7_TEX_INST_SAMPLE && (texInstruction.textureFetch.unnormalized[0] && texInstruction.textureFetch.unnormalized[1] && texInstruction.textureFetch.unnormalized[2] && texInstruction.textureFetch.unnormalized[3])) useTexelCoords = true; @@ -384,7 +389,7 @@ void LatteDecompiler_analyzeExport(LatteDecompilerShaderContext* shaderContext, LatteDecompilerShader* shader = shaderContext->shader; if( shader->shaderType == LatteConst::ShaderType::Pixel ) { - if( cfInstruction->exportType == 0 && cfInstruction->exportArrayBase < 8 ) + if 
(cfInstruction->exportType == 0 && cfInstruction->exportArrayBase < 8) { // remember color outputs that are written for(uint32 i=0; i<(cfInstruction->exportBurstCount+1); i++) @@ -393,9 +398,10 @@ void LatteDecompiler_analyzeExport(LatteDecompilerShaderContext* shaderContext, shader->pixelColorOutputMask |= (1<exportType == 0 && cfInstruction->exportArrayBase == 61 ) + else if (cfInstruction->exportType == 0 && cfInstruction->exportArrayBase == 61) { - // writes pixel depth + if (LatteMRT::GetActiveDepthBufferMask(*shaderContext->contextRegistersNew)) + shader->depthMask = true; } else debugBreakpoint(); @@ -421,7 +427,7 @@ void LatteDecompiler_analyzeExport(LatteDecompilerShaderContext* shaderContext, void LatteDecompiler_analyzeSubroutine(LatteDecompilerShaderContext* shaderContext, uint32 cfAddr) { // analyze CF and clauses up to RET statement - + // todo - find cfInstruction index from cfAddr cemu_assert_debug(false); @@ -500,6 +506,18 @@ namespace LatteDecompiler } } + void _initTextureBindingPointsMTL(LatteDecompilerShaderContext* decompilerContext) + { + // for Metal we use consecutive indices + for (sint32 i = 0; i < LATTE_NUM_MAX_TEX_UNITS; i++) + { + if (!decompilerContext->output->textureUnitMask[i] || decompilerContext->shader->textureRenderTargetIndex[i] != 255) + continue; + decompilerContext->output->resourceMappingMTL.textureUnitToBindingPoint[i] = decompilerContext->currentTextureBindingPointMTL; + decompilerContext->currentTextureBindingPointMTL++; + } + } + void _initHasUniformVarBlock(LatteDecompilerShaderContext* decompilerContext) { decompilerContext->hasUniformVarBlock = false; @@ -507,9 +525,9 @@ namespace LatteDecompiler decompilerContext->hasUniformVarBlock = true; else if (decompilerContext->shader->uniformMode == LATTE_DECOMPILER_UNIFORM_MODE_FULL_CFILE) decompilerContext->hasUniformVarBlock = true; - - bool hasAnyViewportScaleDisabled = - !decompilerContext->contextRegistersNew->PA_CL_VTE_CNTL.get_VPORT_X_SCALE_ENA() || + + bool 
hasAnyViewportScaleDisabled = + !decompilerContext->contextRegistersNew->PA_CL_VTE_CNTL.get_VPORT_X_SCALE_ENA() || !decompilerContext->contextRegistersNew->PA_CL_VTE_CNTL.get_VPORT_Y_SCALE_ENA() || !decompilerContext->contextRegistersNew->PA_CL_VTE_CNTL.get_VPORT_Z_SCALE_ENA(); // we currently only support all on/off. Individual component scaling is not supported @@ -537,6 +555,13 @@ namespace LatteDecompiler { decompilerContext->hasUniformVarBlock = true; // uf_verticesPerInstance and uf_streamoutBufferBase* } + if (g_renderer->GetType() == RendererAPI::Metal) + { + bool isRectVertexShader = (static_cast(decompilerContext->contextRegisters[mmVGT_PRIMITIVE_TYPE]) == LattePrimitiveMode::RECTS); + + if (decompilerContext->shaderType == LatteConst::ShaderType::Vertex && (decompilerContext->options->usesGeometryShader || isRectVertexShader)) + decompilerContext->hasUniformVarBlock = true; // uf_verticesPerInstance + } } void _initUniformBindingPoints(LatteDecompilerShaderContext* decompilerContext) @@ -554,14 +579,13 @@ namespace LatteDecompiler } } // assign binding point to uniform var block - decompilerContext->output->resourceMappingGL.uniformVarsBufferBindingPoint = -1; // OpenGL currently doesnt use a uniform block if (decompilerContext->hasUniformVarBlock) { decompilerContext->output->resourceMappingVK.uniformVarsBufferBindingPoint = decompilerContext->currentBindingPointVK; decompilerContext->currentBindingPointVK++; + decompilerContext->output->resourceMappingMTL.uniformVarsBufferBindingPoint = decompilerContext->currentBufferBindingPointMTL; + decompilerContext->currentBufferBindingPointMTL++; } - else - decompilerContext->output->resourceMappingVK.uniformVarsBufferBindingPoint = -1; // assign binding points to uniform buffers if (decompilerContext->shader->uniformMode == LATTE_DECOMPILER_UNIFORM_MODE_FULL_CBANK) { @@ -580,6 +604,8 @@ namespace LatteDecompiler decompilerContext->output->resourceMappingVK.uniformBuffersBindingPoint[i] = 
decompilerContext->currentBindingPointVK; decompilerContext->currentBindingPointVK++; + decompilerContext->output->resourceMappingMTL.uniformBuffersBindingPoint[i] = decompilerContext->currentBufferBindingPointMTL; + decompilerContext->currentBufferBindingPointMTL++; } // for OpenGL we use the relative buffer index for (uint32 i = 0; i < LATTE_NUM_MAX_UNIFORM_BUFFERS; i++) @@ -601,6 +627,8 @@ namespace LatteDecompiler { decompilerContext->output->resourceMappingVK.tfStorageBindingPoint = decompilerContext->currentBindingPointVK; decompilerContext->currentBindingPointVK++; + decompilerContext->output->resourceMappingMTL.tfStorageBindingPoint = decompilerContext->currentBufferBindingPointMTL; + decompilerContext->currentBufferBindingPointMTL++; } } @@ -617,6 +645,7 @@ namespace LatteDecompiler { decompilerContext->output->resourceMappingGL.attributeMapping[i] = bindingIndex; decompilerContext->output->resourceMappingVK.attributeMapping[i] = bindingIndex; + decompilerContext->output->resourceMappingMTL.attributeMapping[i] = bindingIndex; bindingIndex++; } } @@ -805,7 +834,7 @@ void LatteDecompiler_analyze(LatteDecompilerShaderContext* shaderContext, LatteD for(sint32 i=0; ioutput->textureUnitMask[i]) + if (!shaderContext->output->textureUnitMask[i]) { // texture unit not used shader->textureUnitDim[i] = (Latte::E_DIM)0xFF; @@ -827,6 +856,81 @@ void LatteDecompiler_analyze(LatteDecompilerShaderContext* shaderContext, LatteD shader->textureUnitList[shader->textureUnitListCount] = i; shader->textureUnitListCount++; } + shader->textureRenderTargetIndex[i] = 255; + } + // check if textures are used as render targets + if (shader->shaderType == LatteConst::ShaderType::Pixel) + { + struct { + sint32 index; + MPTR physAddr; + Latte::E_GX2SURFFMT format; + Latte::E_HWTILEMODE tileMode; + } colorBuffers[LATTE_NUM_COLOR_TARGET]{}; + + uint8 colorBufferMask = LatteMRT::GetActiveColorBufferMask(shader, *shaderContext->contextRegistersNew); + sint32 colorBufferCount = 0; + for 
(sint32 i = 0; i < LATTE_NUM_COLOR_TARGET; i++) + { + auto& colorBuffer = colorBuffers[colorBufferCount]; + if (((colorBufferMask) & (1 << i)) == 0) + continue; // color buffer not enabled + + uint32* colorBufferRegBase = shaderContext->contextRegisters + (mmCB_COLOR0_BASE + i); + uint32 regColorBufferBase = colorBufferRegBase[mmCB_COLOR0_BASE - mmCB_COLOR0_BASE] & 0xFFFFFF00; // the low 8 bits are ignored? How to Survive seems to rely on this + + uint32 regColorInfo = colorBufferRegBase[mmCB_COLOR0_INFO - mmCB_COLOR0_BASE]; + + MPTR colorBufferPhysMem = regColorBufferBase; + Latte::E_HWTILEMODE colorBufferTileMode = (Latte::E_HWTILEMODE)((regColorInfo >> 8) & 0xF); + + Latte::E_GX2SURFFMT colorBufferFormat = LatteMRT::GetColorBufferFormat(i, *shaderContext->contextRegistersNew); + + colorBuffer = {i, colorBufferPhysMem, colorBufferFormat, colorBufferTileMode}; + colorBufferCount++; + } + + for (sint32 i = 0; i < shader->textureUnitListCount; i++) + { + sint32 textureIndex = shader->textureUnitList[i]; + const auto& texRegister = texRegs[textureIndex]; + + // get physical address of texture data + MPTR physAddr = (texRegister.word2.get_BASE_ADDRESS() << 8); + if (physAddr == MPTR_NULL) + continue; // invalid data + + auto tileMode = texRegister.word0.get_TILE_MODE(); + + // Check for dimension + auto dim = shader->textureUnitDim[textureIndex]; + // TODO: 2D arrays could technically be supported as well + if (dim != Latte::E_DIM::DIM_2D) + continue; + + // Check for mip level + // TODO: uncomment? 
+ /* + auto lastMip = texRegister.word5.get_LAST_LEVEL(); + // TODO: multiple mip levels could technically be supported as well + if (lastMip != 0) + continue; + */ + + Latte::E_GX2SURFFMT format = LatteTexture_ReconstructGX2Format(texRegister.word1, texRegister.word4); + + // Check if the texture is used as render target + for (sint32 j = 0; j < colorBufferCount; j++) + { + const auto& colorBuffer = colorBuffers[j]; + + if (physAddr == colorBuffer.physAddr && format == colorBuffer.format && tileMode == colorBuffer.tileMode) + { + shader->textureRenderTargetIndex[textureIndex] = colorBuffer.index; + break; + } + } + } } // for geometry shaders check the copy shader for stream writes if (shader->shaderType == LatteConst::ShaderType::Geometry && shaderContext->parsedGSCopyShader->list_streamWrites.empty() == false) @@ -1002,6 +1106,10 @@ void LatteDecompiler_analyze(LatteDecompilerShaderContext* shaderContext, LatteD shaderContext->output->resourceMappingVK.setIndex = 2; LatteDecompiler::_initTextureBindingPointsGL(shaderContext); LatteDecompiler::_initTextureBindingPointsVK(shaderContext); + LatteDecompiler::_initTextureBindingPointsMTL(shaderContext); LatteDecompiler::_initUniformBindingPoints(shaderContext); LatteDecompiler::_initAttributeBindingPoints(shaderContext); + shaderContext->output->resourceMappingMTL.verticesPerInstanceBinding = shaderContext->currentBufferBindingPointMTL++; + shaderContext->output->resourceMappingMTL.indexBufferBinding = shaderContext->currentBufferBindingPointMTL++; + shaderContext->output->resourceMappingMTL.indexTypeBinding = shaderContext->currentBufferBindingPointMTL++; } diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp new file mode 100644 index 000000000..c4b50db12 --- /dev/null +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp @@ -0,0 +1,4458 @@ +#include "Cafe/HW/Latte/Core/LatteConst.h" +#include 
"Cafe/HW/Latte/Core/LatteShaderAssembly.h" +#include "Cafe/HW/Latte/ISA/RegDefines.h" +#include "Cafe/OS/libs/gx2/GX2.h" // todo - remove dependency +#include "Cafe/HW/Latte/Core/Latte.h" +#include "Cafe/HW/Latte/Core/LatteDraw.h" +#include "Cafe/HW/Latte/Core/LatteShader.h" +#include "Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompiler.h" +#include "Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerInternal.h" +#include "Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerInstructions.h" +#include "Cafe/HW/Latte/Core/FetchShader.h" +#include "Cafe/HW/Latte/Renderer/Renderer.h" +#include "Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h" +#include "Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h" +#include "config/ActiveSettings.h" +#include "util/helpers/StringBuf.h" + +#include +#include + +#define _CRLF "\r\n" + +static bool rasterizationEnabled; + +void LatteDecompiler_emitAttributeDecodeMSL(LatteDecompilerShader* shaderContext, StringBuf* src, LatteParsedFetchShaderAttribute_t* attrib); + +/* + * Variable names: + * R0-R127 temp + * Most variables are multi-typed and the respective type is appended to the name + * Type suffixes are: f (float), i (32bit int), ui (unsigned 32bit int) + * Examples: R13ui.x, tempf.z + */ + +// local prototypes +void _emitTypeConversionPrefixMSL(LatteDecompilerShaderContext* shaderContext, sint32 sourceType, sint32 destinationType, sint32 componentCount = 1); +void _emitTypeConversionSuffixMSL(LatteDecompilerShaderContext* shaderContext, sint32 sourceType, sint32 destinationType); +void LatteDecompiler_emitClauseCodeMSL(LatteDecompilerShaderContext* shaderContext, LatteDecompilerCFInstruction* cfInstruction, bool isSubroutine); + +static const char* _getElementStrByIndex(uint32 channel) +{ + switch (channel) + { + case 0: + return "x"; + case 1: + return "y"; + case 2: + return "z"; + case 3: + return "w"; + } + return "UNDEFINED"; +} + +static char _tempGenString[64][256]; +static uint32 _tempGenStringIndex = 0; + +static char* 
_getTempString() +{ + char* str = _tempGenString[_tempGenStringIndex]; + _tempGenStringIndex = (_tempGenStringIndex+1)%64; + return str; +} + +static char* _getActiveMaskVarName(LatteDecompilerShaderContext* shaderContext, sint32 index) +{ + char* varName = _getTempString(); + if (shaderContext->isSubroutine) + sprintf(varName, "activeMaskStackSub%04x[%d]", shaderContext->subroutineInfo->cfAddr, index); + else + sprintf(varName, "activeMaskStack[%d]", index); + return varName; +} + +static char* _getActiveMaskCVarName(LatteDecompilerShaderContext* shaderContext, sint32 index) +{ + char* varName = _getTempString(); + if (shaderContext->isSubroutine) + sprintf(varName, "activeMaskStackCSub%04x[%d]", shaderContext->subroutineInfo->cfAddr, index); + else + sprintf(varName, "activeMaskStackC[%d]", index); + return varName; +} + +static char* _getRegisterVarName(LatteDecompilerShaderContext* shaderContext, uint32 index, sint32 destRelIndexMode=-1) +{ + auto type = shaderContext->typeTracker.defaultDataType; + char* tempStr = _getTempString(); + if (shaderContext->typeTracker.useArrayGPRs == false) + { + if (type == LATTE_DECOMPILER_DTYPE_SIGNED_INT) + sprintf(tempStr, "R%di", index); + else if (type == LATTE_DECOMPILER_DTYPE_FLOAT) + sprintf(tempStr, "R%df", index); + } + else + { + char destRelOffset[32]; + if (destRelIndexMode >= 0) + { + if (destRelIndexMode == GPU7_INDEX_AR_X) + strcpy(destRelOffset, "ARi.x"); + else if (destRelIndexMode == GPU7_INDEX_AR_Y) + strcpy(destRelOffset, "ARi.y"); + else if (destRelIndexMode == GPU7_INDEX_AR_Z) + strcpy(destRelOffset, "ARi.z"); + else if (destRelIndexMode == GPU7_INDEX_AR_W) + strcpy(destRelOffset, "ARi.w"); + else + debugBreakpoint(); + if (type == LATTE_DECOMPILER_DTYPE_SIGNED_INT) + { + sprintf(tempStr, "Ri[%d+%s]", index, destRelOffset); + } + else if (type == LATTE_DECOMPILER_DTYPE_FLOAT) + { + sprintf(tempStr, "Rf[%d+%s]", index, destRelOffset); + } + } + else + { + if (type == LATTE_DECOMPILER_DTYPE_SIGNED_INT) + { + 
sprintf(tempStr, "Ri[%d]", index); + } + else if (type == LATTE_DECOMPILER_DTYPE_FLOAT) + { + sprintf(tempStr, "Rf[%d]", index); + } + } + } + return tempStr; +} + +static void _appendRegisterTypeSuffix(StringBuf* src, sint32 dataType) +{ + if (dataType == LATTE_DECOMPILER_DTYPE_SIGNED_INT) + src->add("i"); + else if (dataType == LATTE_DECOMPILER_DTYPE_UNSIGNED_INT) + src->add("ui"); + else if (dataType == LATTE_DECOMPILER_DTYPE_FLOAT) + src->add("f"); + else + cemu_assert_unimplemented(); +} + +// appends x/y/z/w +static void _appendChannel(StringBuf* src, sint32 channelIndex) +{ + cemu_assert_debug(channelIndex >= 0 && channelIndex <= 3); + switch (channelIndex) + { + case 0: + src->add("x"); + return; + case 1: + src->add("y"); + return; + case 2: + src->add("z"); + return; + case 3: + src->add("w"); + return; + } +} + +// appends .x/.y/.z/.w +static void _appendChannelAccess(StringBuf* src, sint32 channelIndex) +{ + cemu_assert_debug(channelIndex >= 0 && channelIndex <= 3); + switch (channelIndex) + { + case 0: + src->add(".x"); + return; + case 1: + src->add(".y"); + return; + case 2: + src->add(".z"); + return; + case 3: + src->add(".w"); + return; + } +} + +static void _appendPVPS(LatteDecompilerShaderContext* shaderContext, StringBuf* src, uint32 groupIndex, uint8 aluUnit) +{ + cemu_assert_debug(aluUnit < 5); + if (aluUnit == 4) + { + src->addFmt("PS{}", (groupIndex & 1)); + _appendRegisterTypeSuffix(src, shaderContext->typeTracker.defaultDataType); + return; + } + src->addFmt("PV{}", (groupIndex & 1)); + _appendRegisterTypeSuffix(src, shaderContext->typeTracker.defaultDataType); + _appendChannel(src, aluUnit); +} + +std::string _FormatFloatAsConstant(float f) +{ + char floatAsStr[64]; + size_t floatAsStrLen = fmt::format_to_n(floatAsStr, 64, "{:#}", f).size; + size_t floatAsStrLenOrg = floatAsStrLen; + if(floatAsStrLen > 0 && floatAsStr[floatAsStrLen-1] == '.') + { + floatAsStr[floatAsStrLen] = '0'; + floatAsStrLen++; + } + cemu_assert(floatAsStrLen < 50); 
// constant suspiciously long? + floatAsStr[floatAsStrLen] = '\0'; + cemu_assert_debug(floatAsStrLen >= 3); // shortest possible form is "0.0" + return floatAsStr; +} + +// tracks PV/PS and register backups +struct ALUClauseTemporariesState +{ + struct PVPSAlias + { + enum class LOCATION_TYPE : uint8 + { + LOCATION_NONE, + LOCATION_GPR, + LOCATION_PVPS, + }; + + LOCATION_TYPE location{ LOCATION_TYPE::LOCATION_NONE }; + uint8 index; // GPR index or temporary index + uint8 aluUnit; // x,y,z,w (or 5 for PS) + + void SetLocationGPR(uint8 gprIndex, uint8 channel) + { + cemu_assert_debug(channel < 4); + this->location = LOCATION_TYPE::LOCATION_GPR; + this->index = gprIndex; + this->aluUnit = channel; + } + + void SetLocationPSPVTemporary(uint8 aluUnit, uint32 groupIndex) + { + cemu_assert_debug(aluUnit < 5); + this->location = LOCATION_TYPE::LOCATION_PVPS; + this->index = groupIndex & 1; + this->aluUnit = aluUnit; + } + }; + + struct GPRTemporary + { + GPRTemporary(uint8 gprIndex, uint8 channel, uint8 backupVarIndex) : gprIndex(gprIndex), channel(channel), backupVarIndex(backupVarIndex) {} + + uint8 gprIndex; + uint8 channel; + uint8 backupVarIndex; + }; + + void TrackGroupOutputPVPS(LatteDecompilerShaderContext* shaderContext, LatteDecompilerALUInstruction* aluInstr, size_t numInstr) + { + // unset current + for (auto& it : m_pvps) + it.location = PVPSAlias::LOCATION_TYPE::LOCATION_NONE; + for (size_t i = 0; i < numInstr; i++) + { + LatteDecompilerALUInstruction& inst = aluInstr[i]; + if (!inst.isOP3 && inst.opcode == ALU_OP2_INST_NOP) + continue; // skip NOP instruction + + if (inst.writeMask == 0) + { + // map to temporary + m_pvps[inst.aluUnit].SetLocationPSPVTemporary(inst.aluUnit, aluInstr->instructionGroupIndex); + } + else + { + // map to GPR + if(inst.destRel == 0) // is PV/PS set for indexed writes? 
+ m_pvps[inst.aluUnit].SetLocationGPR(inst.destGpr, inst.destElem); + } + } + } + + bool HasPVPS(uint8 aluUnitIndex) const + { + cemu_assert_debug(aluUnitIndex < 5); + return m_pvps[aluUnitIndex].location != PVPSAlias::LOCATION_TYPE::LOCATION_NONE; + } + + void EmitPVPSAccess(LatteDecompilerShaderContext* shaderContext, uint8 aluUnitIndex, uint32 currentGroupIndex) const + { + switch (m_pvps[aluUnitIndex].location) + { + case PVPSAlias::LOCATION_TYPE::LOCATION_GPR: + { + sint32 temporaryIndex = GetTemporaryForGPR(m_pvps[aluUnitIndex].index, m_pvps[aluUnitIndex].aluUnit); + if (temporaryIndex < 0) + { + shaderContext->shaderSource->add(_getRegisterVarName(shaderContext, m_pvps[aluUnitIndex].index, -1)); + _appendChannelAccess(shaderContext->shaderSource, m_pvps[aluUnitIndex].aluUnit); + } + else + { + // use temporary instead of GPR + shaderContext->shaderSource->addFmt("backupReg{}", temporaryIndex); + _appendRegisterTypeSuffix(shaderContext->shaderSource, shaderContext->typeTracker.defaultDataType); + } + break; + } + case PVPSAlias::LOCATION_TYPE::LOCATION_PVPS: + _appendPVPS(shaderContext, shaderContext->shaderSource, currentGroupIndex-1, m_pvps[aluUnitIndex].aluUnit); + break; + default: + cemuLog_log(LogType::Force, "Shader {:016x} accesses PV/PS without writing to it", shaderContext->shaderBaseHash); + cemu_assert_suspicious(); + break; + } + } + + /* + * Check for GPR channels which are modified before they are read within the same group + * These registers need to be copied to a temporary + */ + void CreateGPRTemporaries(LatteDecompilerShaderContext* shaderContext, std::span aluInstructions) + { + uint8 registerChannelWriteMask[(LATTE_NUM_GPR * 4 + 7) / 8] = { 0 }; + + m_gprTemporaries.clear(); + for (auto& aluInstruction : aluInstructions) + { + // ignore NOP instructions + if (aluInstruction.isOP3 == false && aluInstruction.opcode == ALU_OP2_INST_NOP) + continue; + cemu_assert_debug(aluInstruction.destElem <= 3); + // check if any previously written 
register is read + for (sint32 f = 0; f < 3; f++) + { + uint32 readGPRIndex; + uint32 readGPRChannel; + if (GPU7_ALU_SRC_IS_GPR(aluInstruction.sourceOperand[f].sel)) + { + readGPRIndex = GPU7_ALU_SRC_GET_GPR_INDEX(aluInstruction.sourceOperand[f].sel); + cemu_assert_debug(aluInstruction.sourceOperand[f].chan <= 3); + readGPRChannel = aluInstruction.sourceOperand[f].chan; + } + else if (GPU7_ALU_SRC_IS_PV(aluInstruction.sourceOperand[f].sel) || GPU7_ALU_SRC_IS_PS(aluInstruction.sourceOperand[f].sel)) + { + uint8 aluUnitIndex = 0; + if (GPU7_ALU_SRC_IS_PV(aluInstruction.sourceOperand[f].sel)) + aluUnitIndex = aluInstruction.sourceOperand[f].chan; + else + aluUnitIndex = 4; + // if aliased to a GPR, then consider it a GPR read + if(m_pvps[aluUnitIndex].location != PVPSAlias::LOCATION_TYPE::LOCATION_GPR) + continue; + readGPRIndex = m_pvps[aluUnitIndex].index; + readGPRChannel = m_pvps[aluUnitIndex].aluUnit; + } + else + continue; + // track GPR read + if ((registerChannelWriteMask[(readGPRIndex * 4 + aluInstruction.sourceOperand[f].chan) / 8] & (1 << ((readGPRIndex * 4 + aluInstruction.sourceOperand[f].chan) % 8))) != 0) + { + // register is overwritten by previous instruction, a temporary variable is required + if (GetTemporaryForGPR(readGPRIndex, readGPRChannel) < 0) + m_gprTemporaries.emplace_back(readGPRIndex, readGPRChannel, m_gprTemporaries.size()); + } + } + // track write + if (aluInstruction.writeMask != 0) + registerChannelWriteMask[(aluInstruction.destGpr * 4 + aluInstruction.destElem) / 8] |= (1 << ((aluInstruction.destGpr * 4 + aluInstruction.destElem) % 8)); + } + // output code to move GPRs into temporaries + StringBuf* src = shaderContext->shaderSource; + for (auto& it : m_gprTemporaries) + { + src->addFmt("backupReg{}", it.backupVarIndex); + _appendRegisterTypeSuffix(src, shaderContext->typeTracker.defaultDataType); + src->add(" = "); + src->add(_getRegisterVarName(shaderContext, it.gprIndex)); + _appendChannelAccess(src, it.channel); + src->add(";" 
_CRLF); + } + } + + // returns -1 if none present + sint32 GetTemporaryForGPR(uint8 gprIndex, uint8 channel) const + { + for (auto& it : m_gprTemporaries) + { + if (it.gprIndex == gprIndex && it.channel == channel) + return (sint32)it.backupVarIndex; + } + return -1; + } + +private: + PVPSAlias m_pvps[5]{}; + boost::container::small_vector m_gprTemporaries; +}; + +sint32 _getVertexShaderOutParamSemanticId(uint32* contextRegisters, sint32 index); +sint32 _getInputRegisterDataType(LatteDecompilerShaderContext* shaderContext, LatteDecompilerALUInstruction* aluInstruction, sint32 operandIndex); +sint32 _getALUInstructionOutputDataType(LatteDecompilerShaderContext* shaderContext, LatteDecompilerALUInstruction* aluInstruction); +bool _isReductionInstruction(LatteDecompilerALUInstruction* aluInstruction); + +/* + * Writes the name of the output variable and channel + * E.g. R5f.x or tempf.x if writeMask is 0 + */ +static void _emitInstructionOutputVariableName(LatteDecompilerShaderContext* shaderContext, LatteDecompilerALUInstruction* aluInstruction) +{ + auto src = shaderContext->shaderSource; + sint32 outputDataType = _getALUInstructionOutputDataType(shaderContext, aluInstruction); + if( aluInstruction->writeMask == 0 ) + { + // does not output to GPR + if( !_isReductionInstruction(aluInstruction) ) + { + // output to PV/PS + _appendPVPS(shaderContext, src, aluInstruction->instructionGroupIndex, aluInstruction->aluUnit); + return; + } + else + { + // output to temp + src->add("temp"); + _appendRegisterTypeSuffix(src, outputDataType); + } + _appendChannelAccess(src, aluInstruction->aluUnit); + } + else + { + // output to GPR. 
Aliasing to PV/PS happens at the end of the group + src->add(_getRegisterVarName(shaderContext, aluInstruction->destGpr, aluInstruction->destRel==0?-1:aluInstruction->indexMode)); + _appendChannelAccess(src, aluInstruction->destElem); + } +} + +static void _emitInstructionPVPSOutputVariableName(LatteDecompilerShaderContext* shaderContext, LatteDecompilerALUInstruction* aluInstruction) +{ + _appendPVPS(shaderContext, shaderContext->shaderSource, aluInstruction->instructionGroupIndex, aluInstruction->aluUnit); +} + +static void _emitRegisterAccessCode(LatteDecompilerShaderContext* shaderContext, sint32 gprIndex, sint32 channel0, sint32 channel1, sint32 channel2, sint32 channel3, sint32 dataType = -1) +{ + StringBuf* src = shaderContext->shaderSource; + sint32 registerElementDataType = shaderContext->typeTracker.defaultDataType; + cemu_assert_debug(gprIndex >= 0 && gprIndex <= 127); + + sint32 channelArray[4]; + channelArray[0] = channel0; + channelArray[1] = channel1; + channelArray[2] = channel2; + channelArray[3] = channel3; + + sint32 numComponents = 0; + for (sint32 i = 0; i < 4; i++) + { + if (channelArray[i] >= 0 && channelArray[i] <= 3) + numComponents++; + } + + if (dataType >= 0) + { + _emitTypeConversionPrefixMSL(shaderContext, registerElementDataType, dataType, numComponents); + } + if (shaderContext->typeTracker.useArrayGPRs) + src->add("R"); + else + src->addFmt("R{}", gprIndex); + _appendRegisterTypeSuffix(src, registerElementDataType); + if (shaderContext->typeTracker.useArrayGPRs) + src->addFmt("[{}]", gprIndex); + + src->add("."); + + for (sint32 i = 0; i < 4; i++) + { + if (channelArray[i] >= 0 && channelArray[i] <= 3) + src->add(_getElementStrByIndex(channelArray[i])); + else if (channelArray[i] == -1) + { + // channel not used + } + else + { + cemu_assert_unimplemented(); + } + } + if (dataType >= 0) + _emitTypeConversionSuffixMSL(shaderContext, registerElementDataType, dataType); +} + +// optimized variant of _emitRegisterAccessCode for raw one 
channel reads +static void _emitRegisterChannelAccessCode(LatteDecompilerShaderContext* shaderContext, sint32 gprIndex, sint32 channel, sint32 dataType) +{ + cemu_assert_debug(gprIndex >= 0 && gprIndex <= 127); + cemu_assert_debug(channel >= 0 && channel < 4); + StringBuf* src = shaderContext->shaderSource; + sint32 registerElementDataType = shaderContext->typeTracker.defaultDataType; + _emitTypeConversionPrefixMSL(shaderContext, registerElementDataType, dataType); + if (shaderContext->typeTracker.useArrayGPRs) + src->add("R"); + else + src->addFmt("R{}", gprIndex); + _appendRegisterTypeSuffix(src, registerElementDataType); + if (shaderContext->typeTracker.useArrayGPRs) + src->addFmt("[{}]", gprIndex); + src->add("."); + src->add(_getElementStrByIndex(channel)); + _emitTypeConversionSuffixMSL(shaderContext, registerElementDataType, dataType); +} + +static void _emitALURegisterInputAccessCode(LatteDecompilerShaderContext* shaderContext, LatteDecompilerALUInstruction* aluInstruction, sint32 operandIndex) +{ + StringBuf* src = shaderContext->shaderSource; + sint32 currentRegisterElementType = _getInputRegisterDataType(shaderContext, aluInstruction, operandIndex); + cemu_assert_debug(GPU7_ALU_SRC_IS_GPR(aluInstruction->sourceOperand[operandIndex].sel)); + sint32 gprIndex = GPU7_ALU_SRC_GET_GPR_INDEX(aluInstruction->sourceOperand[operandIndex].sel); + sint32 temporaryIndex = shaderContext->aluPVPSState->GetTemporaryForGPR(gprIndex, aluInstruction->sourceOperand[operandIndex].chan); + if(temporaryIndex >= 0) + { + // access via backup variable + src->addFmt("backupReg{}", temporaryIndex); + _appendRegisterTypeSuffix(src, currentRegisterElementType); + } + else + { + // access via register variable + _emitRegisterAccessCode(shaderContext, gprIndex, aluInstruction->sourceOperand[operandIndex].chan, -1, -1, -1); + } +} + +static void _emitPVPSAccessCode(LatteDecompilerShaderContext* shaderContext, LatteDecompilerALUInstruction* aluInstruction, sint32 operandIndex, uint8 
aluUnitIndex) +{ + cemu_assert_debug(aluInstruction->instructionGroupIndex > 0); // PV/PS is uninitialized for group 0 + // PV/PS vars are currently always using the default type (shaderContext->typeTracker.defaultDataType) + shaderContext->aluPVPSState->EmitPVPSAccess(shaderContext, aluUnitIndex, aluInstruction->instructionGroupIndex); +} + +/* + * Emits the expression used for calculating the index for uniform access + * For static access, this is a number + * For dynamic access, this is AR.* + base + */ +static void _emitUniformAccessIndexCode(LatteDecompilerShaderContext* shaderContext, LatteDecompilerALUInstruction* aluInstruction, sint32 operandIndex) +{ + StringBuf* src = shaderContext->shaderSource; + bool isUniformRegister = GPU7_ALU_SRC_IS_CFILE(aluInstruction->sourceOperand[operandIndex].sel); + sint32 uniformOffset = 0; // index into array, for relative accesses this is the base offset + if( isUniformRegister ) + { + uniformOffset = GPU7_ALU_SRC_GET_CFILE_INDEX(aluInstruction->sourceOperand[operandIndex].sel); + } + else + { + if( GPU7_ALU_SRC_IS_CBANK0(aluInstruction->sourceOperand[operandIndex].sel) ) + { + uniformOffset = GPU7_ALU_SRC_GET_CBANK0_INDEX(aluInstruction->sourceOperand[operandIndex].sel) + aluInstruction->cfInstruction->cBank0AddrBase; + } + else + { + uniformOffset = GPU7_ALU_SRC_GET_CBANK1_INDEX(aluInstruction->sourceOperand[operandIndex].sel) + aluInstruction->cfInstruction->cBank1AddrBase; + } + } + if( aluInstruction->sourceOperand[operandIndex].rel != 0 ) + { + if (aluInstruction->indexMode == GPU7_INDEX_AR_X) + src->addFmt("ARi.x+{}", uniformOffset); + else if (aluInstruction->indexMode == GPU7_INDEX_AR_Y) + src->addFmt("ARi.y+{}", uniformOffset); + else if (aluInstruction->indexMode == GPU7_INDEX_AR_Z) + src->addFmt("ARi.z+{}", uniformOffset); + else if (aluInstruction->indexMode == GPU7_INDEX_AR_W) + src->addFmt("ARi.w+{}", uniformOffset); + else + cemu_assert_unimplemented(); + } + else + { + src->addFmt("{}", uniformOffset); + 
} +} + +static void _emitUniformAccessCode(LatteDecompilerShaderContext* shaderContext, LatteDecompilerALUInstruction* aluInstruction, sint32 operandIndex, sint32 requiredType) +{ + StringBuf* src = shaderContext->shaderSource; + if(shaderContext->shader->uniformMode == LATTE_DECOMPILER_UNIFORM_MODE_REMAPPED ) + { + // uniform registers or buffers are accessed statically with predictable offsets + // find entry in remapped uniform + if( aluInstruction->sourceOperand[operandIndex].rel != 0 ) + debugBreakpoint(); + bool isUniformRegister = GPU7_ALU_SRC_IS_CFILE(aluInstruction->sourceOperand[operandIndex].sel); + sint32 uniformOffset = 0; // index into array + sint32 uniformBufferIndex = 0; + if( isUniformRegister ) + { + uniformOffset = GPU7_ALU_SRC_GET_CFILE_INDEX(aluInstruction->sourceOperand[operandIndex].sel); + uniformBufferIndex = 0; + } + else + { + if( GPU7_ALU_SRC_IS_CBANK0(aluInstruction->sourceOperand[operandIndex].sel) ) + { + uniformOffset = GPU7_ALU_SRC_GET_CBANK0_INDEX(aluInstruction->sourceOperand[operandIndex].sel) + aluInstruction->cfInstruction->cBank0AddrBase; + uniformBufferIndex = aluInstruction->cfInstruction->cBank0Index; + } + else + { + uniformOffset = GPU7_ALU_SRC_GET_CBANK1_INDEX(aluInstruction->sourceOperand[operandIndex].sel) + aluInstruction->cfInstruction->cBank1AddrBase; + uniformBufferIndex = aluInstruction->cfInstruction->cBank1Index; + } + } + LatteDecompilerRemappedUniformEntry_t* remappedUniformEntry = NULL; + for(size_t i=0; i< shaderContext->shader->list_remappedUniformEntries.size(); i++) + { + LatteDecompilerRemappedUniformEntry_t* remappedUniformEntryItr = shaderContext->shader->list_remappedUniformEntries.data() + i; + if( remappedUniformEntryItr->isRegister && isUniformRegister ) + { + if( remappedUniformEntryItr->index == uniformOffset ) + { + remappedUniformEntry = remappedUniformEntryItr; + break; + } + } + else + { + if( remappedUniformEntryItr->kcacheBankId == uniformBufferIndex && remappedUniformEntryItr->index == 
uniformOffset ) + { + remappedUniformEntry = remappedUniformEntryItr; + break; + } + } + } + cemu_assert_debug(remappedUniformEntry); + _emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_SIGNED_INT, requiredType); + src->addFmt("supportBuffer.remapped[{}]", remappedUniformEntry->mappedIndex); + _appendChannelAccess(src, aluInstruction->sourceOperand[operandIndex].chan); + _emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_SIGNED_INT, requiredType); + } + else if( shaderContext->shader->uniformMode == LATTE_DECOMPILER_UNIFORM_MODE_FULL_CFILE ) + { + // uniform registers are accessed with unpredictable (dynamic) offset + _emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_SIGNED_INT, requiredType); + src->add("supportBuffer.uniformRegister["); + _emitUniformAccessIndexCode(shaderContext, aluInstruction, operandIndex); + src->add("]"); + + _appendChannelAccess(src, aluInstruction->sourceOperand[operandIndex].chan); + _emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_SIGNED_INT, requiredType); + } + else if( shaderContext->shader->uniformMode == LATTE_DECOMPILER_UNIFORM_MODE_FULL_CBANK ) + { + // uniform buffers are available as a whole + bool isUniformRegister = GPU7_ALU_SRC_IS_CFILE(aluInstruction->sourceOperand[operandIndex].sel); + if( isUniformRegister ) + debugBreakpoint(); + sint32 uniformBufferIndex = 0; + if( GPU7_ALU_SRC_IS_CBANK0(aluInstruction->sourceOperand[operandIndex].sel) ) + { + uniformBufferIndex = aluInstruction->cfInstruction->cBank0Index; + } + else + { + uniformBufferIndex = aluInstruction->cfInstruction->cBank1Index; + } + _emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, requiredType); + src->addFmt("ubuff{}.d[", uniformBufferIndex); + _emitUniformAccessIndexCode(shaderContext, aluInstruction, operandIndex); + src->addFmt("]"); + + _appendChannelAccess(src, aluInstruction->sourceOperand[operandIndex].chan); + _emitTypeConversionSuffixMSL(shaderContext, 
LATTE_DECOMPILER_DTYPE_FLOAT, requiredType); + } + else + debugBreakpoint(); +} + +// Generates (slow) code to read an indexed GPR +static void _emitCodeToReadRelativeGPR(LatteDecompilerShaderContext* shaderContext, LatteDecompilerALUInstruction* aluInstruction, sint32 operandIndex, sint32 requiredType) +{ + StringBuf* src = shaderContext->shaderSource; + uint32 gprBaseIndex = GPU7_ALU_SRC_GET_GPR_INDEX(aluInstruction->sourceOperand[operandIndex].sel); + cemu_assert_debug(aluInstruction->sourceOperand[operandIndex].rel != 0); + + if( shaderContext->typeTracker.useArrayGPRs ) + { + _emitTypeConversionPrefixMSL(shaderContext, shaderContext->typeTracker.defaultDataType, requiredType); + src->add(_getRegisterVarName(shaderContext, gprBaseIndex, aluInstruction->indexMode)); + _appendChannelAccess(src, aluInstruction->sourceOperand[operandIndex].chan); + _emitTypeConversionSuffixMSL(shaderContext, shaderContext->typeTracker.defaultDataType, requiredType); + return; + } + + char indexAccessCode[64]; + if (aluInstruction->indexMode == GPU7_INDEX_AR_X) + sprintf(indexAccessCode, "ARi.x"); + else if (aluInstruction->indexMode == GPU7_INDEX_AR_Y) + sprintf(indexAccessCode, "ARi.y"); + else if (aluInstruction->indexMode == GPU7_INDEX_AR_Z) + sprintf(indexAccessCode, "ARi.z"); + else if (aluInstruction->indexMode == GPU7_INDEX_AR_W) + sprintf(indexAccessCode, "ARi.w"); + else + cemu_assert_unimplemented(); + + if( LATTE_DECOMPILER_DTYPE_SIGNED_INT != requiredType ) + _emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_SIGNED_INT, requiredType); + + // generated code looks like this: + // result = ((lookupIndex==0)?GPR5:(lookupIndex==1)?GPR6:(lookupIndex==2)?GPR7:...:(lookupIndex==122)?GPR127:0) + src->add("("); + for(sint32 i=gprBaseIndex; ianalyzer.gprUseMask[i / 8] & (1 << (i % 8))) == 0 ) + continue; + src->addFmt("({}=={})?", indexAccessCode, i-gprBaseIndex); + // code to access gpr + uint32 gprIndex = i; + src->add(_getRegisterVarName(shaderContext, i)); + 
_appendChannelAccess(src, aluInstruction->sourceOperand[operandIndex].chan); + src->add(":"); + } + src->add("0)"); + if( LATTE_DECOMPILER_DTYPE_SIGNED_INT != requiredType ) + _emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_SIGNED_INT, requiredType); +} + +static void _emitOperandInputCode(LatteDecompilerShaderContext* shaderContext, LatteDecompilerALUInstruction* aluInstruction, sint32 operandIndex, sint32 requiredType) +{ + StringBuf* src = shaderContext->shaderSource; + if( operandIndex < 0 || operandIndex >= 3 ) + debugBreakpoint(); + sint32 requiredTypeOut = requiredType; + if( requiredType != LATTE_DECOMPILER_DTYPE_FLOAT && (aluInstruction->sourceOperand[operandIndex].abs != 0 || aluInstruction->sourceOperand[operandIndex].neg != 0) ) + { + // we need to apply float operations on the input but it's not read as a float + // force internal required type to float and then cast it back to whatever type is actually required + requiredType = LATTE_DECOMPILER_DTYPE_FLOAT; + } + + if( requiredTypeOut != requiredType ) + _emitTypeConversionPrefixMSL(shaderContext, requiredType, requiredTypeOut); + + if( aluInstruction->sourceOperand[operandIndex].neg != 0 ) + src->add("-("); + if( aluInstruction->sourceOperand[operandIndex].abs != 0 ) + src->add("abs("); + + if( GPU7_ALU_SRC_IS_GPR(aluInstruction->sourceOperand[operandIndex].sel) ) + { + if( aluInstruction->sourceOperand[operandIndex].rel != 0 ) + { + _emitCodeToReadRelativeGPR(shaderContext, aluInstruction, operandIndex, requiredType); + } + else + { + uint32 gprIndex = GPU7_ALU_SRC_GET_GPR_INDEX(aluInstruction->sourceOperand[operandIndex].sel); + if( requiredType == LATTE_DECOMPILER_DTYPE_SIGNED_INT ) + { + // signed int 32bit + sint32 currentRegisterElementType = _getInputRegisterDataType(shaderContext, aluInstruction, operandIndex); + // write code for register input + _emitTypeConversionPrefixMSL(shaderContext, currentRegisterElementType, requiredType); + 
_emitALURegisterInputAccessCode(shaderContext, aluInstruction, operandIndex); + _emitTypeConversionSuffixMSL(shaderContext, currentRegisterElementType, requiredType); + } + else if( requiredType == LATTE_DECOMPILER_DTYPE_UNSIGNED_INT ) + { + // unsigned int 32bit + sint32 currentRegisterElementType = _getInputRegisterDataType(shaderContext, aluInstruction, operandIndex); + if( currentRegisterElementType == LATTE_DECOMPILER_DTYPE_SIGNED_INT ) + { + // need to convert from int to uint + src->add("uint("); + } + else if( currentRegisterElementType == LATTE_DECOMPILER_DTYPE_UNSIGNED_INT ) + { + // no extra work necessary + } + else + debugBreakpoint(); + // write code for register input + _emitALURegisterInputAccessCode(shaderContext, aluInstruction, operandIndex); + if( currentRegisterElementType == LATTE_DECOMPILER_DTYPE_SIGNED_INT ) + { + src->add(")"); + } + } + else if( requiredType == LATTE_DECOMPILER_DTYPE_FLOAT ) + { + // float 32bit + sint32 currentRegisterElementType = _getInputRegisterDataType(shaderContext, aluInstruction, operandIndex); + if( currentRegisterElementType == LATTE_DECOMPILER_DTYPE_SIGNED_INT ) + { + // need to convert (not cast) from int bits to float + src->add("as_type("); // TODO: correct? 
+ } + else if( currentRegisterElementType == LATTE_DECOMPILER_DTYPE_FLOAT ) + { + // no extra work necessary + } + else + debugBreakpoint(); + // write code for register input + _emitALURegisterInputAccessCode(shaderContext, aluInstruction, operandIndex); + if( currentRegisterElementType == LATTE_DECOMPILER_DTYPE_SIGNED_INT ) + { + src->add(")"); + } + } + else + debugBreakpoint(); + } + } + else if( GPU7_ALU_SRC_IS_CONST_0F(aluInstruction->sourceOperand[operandIndex].sel) ) + { + if(requiredType == LATTE_DECOMPILER_DTYPE_SIGNED_INT || requiredType == LATTE_DECOMPILER_DTYPE_UNSIGNED_INT) + src->add("0"); + else if( requiredType == LATTE_DECOMPILER_DTYPE_FLOAT ) + src->add("0.0"); + } + else if( GPU7_ALU_SRC_IS_CONST_1F(aluInstruction->sourceOperand[operandIndex].sel) ) + { + _emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, requiredType); + src->add("1.0"); + _emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, requiredType); + } + else if( GPU7_ALU_SRC_IS_CONST_0_5F(aluInstruction->sourceOperand[operandIndex].sel) ) + { + _emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, requiredType); + src->add("0.5"); + _emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, requiredType); + } + else if( GPU7_ALU_SRC_IS_CONST_1I(aluInstruction->sourceOperand[operandIndex].sel) ) + { + if (requiredType == LATTE_DECOMPILER_DTYPE_SIGNED_INT) + src->add("int(1)"); + else if (requiredType == LATTE_DECOMPILER_DTYPE_UNSIGNED_INT) + src->add("uint(1)"); + else + cemu_assert_suspicious(); + } + else if( GPU7_ALU_SRC_IS_CONST_M1I(aluInstruction->sourceOperand[operandIndex].sel) ) + { + if( requiredType == LATTE_DECOMPILER_DTYPE_SIGNED_INT ) + src->add("int(-1)"); + else + cemu_assert_suspicious(); + } + else if( GPU7_ALU_SRC_IS_LITERAL(aluInstruction->sourceOperand[operandIndex].sel) ) + { + if( requiredType == LATTE_DECOMPILER_DTYPE_SIGNED_INT ) + src->addFmt("int(0x{:x})", 
aluInstruction->literalData.w[aluInstruction->sourceOperand[operandIndex].chan]); + else if( requiredType == LATTE_DECOMPILER_DTYPE_UNSIGNED_INT ) + src->addFmt("uint(0x{:x})", aluInstruction->literalData.w[aluInstruction->sourceOperand[operandIndex].chan]); + else if (requiredType == LATTE_DECOMPILER_DTYPE_FLOAT) + { + uint32 constVal = aluInstruction->literalData.w[aluInstruction->sourceOperand[operandIndex].chan]; + sint32 exponent = (constVal >> 23) & 0xFF; + exponent -= 127; + if ((constVal & 0xFF) == 0 && exponent >= -10 && exponent <= 10) + { + src->add(_FormatFloatAsConstant(*(float*)&constVal)); + } + else + src->addFmt("as_type(0x{:08x})", constVal); + } + } + else if( GPU7_ALU_SRC_IS_CFILE(aluInstruction->sourceOperand[operandIndex].sel) ) + { + _emitUniformAccessCode(shaderContext, aluInstruction, operandIndex, requiredType); + } + else if( GPU7_ALU_SRC_IS_CBANK0(aluInstruction->sourceOperand[operandIndex].sel) || + GPU7_ALU_SRC_IS_CBANK1(aluInstruction->sourceOperand[operandIndex].sel) ) + { + _emitUniformAccessCode(shaderContext, aluInstruction, operandIndex, requiredType); + } + else if( GPU7_ALU_SRC_IS_PV(aluInstruction->sourceOperand[operandIndex].sel) ) + { + sint32 currentPVDataType = _getInputRegisterDataType(shaderContext, aluInstruction, operandIndex); + _emitTypeConversionPrefixMSL(shaderContext, currentPVDataType, requiredType); + _emitPVPSAccessCode(shaderContext, aluInstruction, operandIndex, aluInstruction->sourceOperand[operandIndex].chan); + _emitTypeConversionSuffixMSL(shaderContext, currentPVDataType, requiredType); + } + else if( GPU7_ALU_SRC_IS_PS(aluInstruction->sourceOperand[operandIndex].sel) ) + { + sint32 currentPSDataType = _getInputRegisterDataType(shaderContext, aluInstruction, operandIndex); + _emitTypeConversionPrefixMSL(shaderContext, currentPSDataType, requiredType); + _emitPVPSAccessCode(shaderContext, aluInstruction, operandIndex, 4); + _emitTypeConversionSuffixMSL(shaderContext, currentPSDataType, requiredType); + } + 
else + { + cemuLog_log(LogType::Force, "Unsupported shader ALU operand sel {:#x}\n", aluInstruction->sourceOperand[operandIndex].sel); + debugBreakpoint(); + } + + if( aluInstruction->sourceOperand[operandIndex].abs != 0 ) + src->add(")"); + if( aluInstruction->sourceOperand[operandIndex].neg != 0 ) + src->add(")"); + + if( requiredTypeOut != requiredType ) + _emitTypeConversionSuffixMSL(shaderContext, requiredType, requiredTypeOut); +} + +void _emitTypeConversionPrefixMSL(LatteDecompilerShaderContext* shaderContext, sint32 sourceType, sint32 destinationType, sint32 componentCount) +{ + if( sourceType == destinationType ) + return; + StringBuf* src = shaderContext->shaderSource; + if (destinationType == LATTE_DECOMPILER_DTYPE_SIGNED_INT) + { + if (componentCount == 1) + src->add("as_type("); + else + src->addFmt("as_type(", componentCount); + } + else if (destinationType == LATTE_DECOMPILER_DTYPE_UNSIGNED_INT) + { + if (componentCount == 1) + src->add("as_type("); + else + src->addFmt("as_type(", componentCount); + } + else if (destinationType == LATTE_DECOMPILER_DTYPE_FLOAT) + { + if (componentCount == 1) + src->add("as_type("); + else + src->addFmt("as_type(", componentCount); + } + else + cemu_assert_debug(false); +} + +void _emitTypeConversionSuffixMSL(LatteDecompilerShaderContext* shaderContext, sint32 sourceType, sint32 destinationType) +{ + if( sourceType == destinationType ) + return; + StringBuf* src = shaderContext->shaderSource; + src->add(")"); +} + +template +static void _emitALUOperationBinary(LatteDecompilerShaderContext* shaderContext, LatteDecompilerALUInstruction* aluInstruction, const char* operandStr) +{ + StringBuf* src = shaderContext->shaderSource; + sint32 outputType = _getALUInstructionOutputDataType(shaderContext, aluInstruction); + _emitInstructionOutputVariableName(shaderContext, aluInstruction); + src->add(" = "); + _emitTypeConversionPrefixMSL(shaderContext, TDataType, outputType); + _emitOperandInputCode(shaderContext, aluInstruction, 
0, TDataType); + src->add((char*)operandStr); + _emitOperandInputCode(shaderContext, aluInstruction, 1, TDataType); + _emitTypeConversionSuffixMSL(shaderContext, TDataType, outputType); + src->add(";" _CRLF); +} + +static bool _isSameGPROperand(LatteDecompilerALUInstruction* aluInstruction, sint32 opIndexA, sint32 opIndexB) +{ + if (aluInstruction->sourceOperand[opIndexA].sel != aluInstruction->sourceOperand[opIndexB].sel) + return false; + if (!GPU7_ALU_SRC_IS_GPR(aluInstruction->sourceOperand[opIndexA].sel)) + return false; + if (aluInstruction->sourceOperand[opIndexA].chan != aluInstruction->sourceOperand[opIndexB].chan) + return false; + if (aluInstruction->sourceOperand[opIndexA].abs != aluInstruction->sourceOperand[opIndexB].abs) + return false; + if (aluInstruction->sourceOperand[opIndexA].neg != aluInstruction->sourceOperand[opIndexB].neg) + return false; + if (aluInstruction->sourceOperand[opIndexA].rel != aluInstruction->sourceOperand[opIndexB].rel) + return false; + return true; +} + +static bool _operandHasModifiers(LatteDecompilerALUInstruction* aluInstruction, sint32 opIndex) +{ + return aluInstruction->sourceOperand[opIndex].abs != 0 || aluInstruction->sourceOperand[opIndex].neg != 0; +} + +static void _emitALUOP2InstructionCode(LatteDecompilerShaderContext* shaderContext, LatteDecompilerCFInstruction* cfInstruction, LatteDecompilerALUInstruction* aluInstruction) +{ + StringBuf* src = shaderContext->shaderSource; + sint32 outputType = _getALUInstructionOutputDataType(shaderContext, aluInstruction); // data type of output + if( aluInstruction->opcode == ALU_OP2_INST_MOV ) + { + bool requiresFloatMove = false; + requiresFloatMove = aluInstruction->sourceOperand[0].abs != 0 || aluInstruction->sourceOperand[0].neg != 0; + if( requiresFloatMove ) + { + // abs/neg operations are applied to source operand, do float based move + _emitInstructionOutputVariableName(shaderContext, aluInstruction); + src->add(" = "); + _emitTypeConversionPrefixMSL(shaderContext, 
LATTE_DECOMPILER_DTYPE_FLOAT, outputType); + _emitOperandInputCode(shaderContext, aluInstruction, 0, LATTE_DECOMPILER_DTYPE_FLOAT); + _emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType); + src->add(";" _CRLF); + } + else + { + _emitInstructionOutputVariableName(shaderContext, aluInstruction); + src->add(" = "); + _emitOperandInputCode(shaderContext, aluInstruction, 0, outputType); + src->add(";" _CRLF); + } + } + else if( aluInstruction->opcode == ALU_OP2_INST_MOVA_FLOOR ) + { + cemu_assert_debug(aluInstruction->writeMask == 0); + cemu_assert_debug(aluInstruction->omod == 0); + src->add("tempResultf = "); + _emitOperandInputCode(shaderContext, aluInstruction, 0, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(";" _CRLF); + src->add("tempResultf = floor(tempResultf);" _CRLF); + src->add("tempResultf = clamp(tempResultf, -256.0, 255.0);" _CRLF); + // set AR + if( aluInstruction->destElem == 0 ) + src->add("ARi.x = int(tempResultf);" _CRLF); + else if( aluInstruction->destElem == 1 ) + src->add("ARi.y = int(tempResultf);" _CRLF); + else if( aluInstruction->destElem == 2 ) + src->add("ARi.z = int(tempResultf);" _CRLF); + else + src->add("ARi.w = int(tempResultf);" _CRLF); + // set output + _emitInstructionOutputVariableName(shaderContext, aluInstruction); + src->add(" = "); + if( outputType != LATTE_DECOMPILER_DTYPE_SIGNED_INT ) + debugBreakpoint(); // todo + src->add("as_type(tempResultf)"); + src->add(";" _CRLF); + } + else if( aluInstruction->opcode == ALU_OP2_INST_MOVA_INT ) + { + cemu_assert_debug(aluInstruction->writeMask == 0); + cemu_assert_debug(aluInstruction->omod == 0); + src->add("tempResulti = "); + _emitOperandInputCode(shaderContext, aluInstruction, 0, LATTE_DECOMPILER_DTYPE_SIGNED_INT); + src->add(";" _CRLF); + src->add("tempResulti = clamp(tempResulti, -256, 255);" _CRLF); + // set AR + if( aluInstruction->destElem == 0 ) + src->add("ARi.x = tempResulti;" _CRLF); + else if( aluInstruction->destElem == 1 ) + src->add("ARi.y 
= tempResulti;" _CRLF); + else if( aluInstruction->destElem == 2 ) + src->add("ARi.z = tempResulti;" _CRLF); + else + src->add("ARi.w = tempResulti;" _CRLF); + // set output + _emitInstructionOutputVariableName(shaderContext, aluInstruction); + src->add(" = "); + if( outputType != LATTE_DECOMPILER_DTYPE_SIGNED_INT ) + debugBreakpoint(); // todo + src->add("tempResulti"); + src->add(";" _CRLF); + + } + else if( aluInstruction->opcode == ALU_OP2_INST_ADD ) + { + _emitALUOperationBinary(shaderContext, aluInstruction, " + "); + } + else if( aluInstruction->opcode == ALU_OP2_INST_MUL ) + { + // 0*anything is always 0 + _emitInstructionOutputVariableName(shaderContext, aluInstruction); + src->add(" = "); + _emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType); + + // if any operand is a non-zero literal or constant we can use standard multiplication + bool useDefaultMul = false; + if (GPU7_ALU_SRC_IS_CONST_0F(aluInstruction->sourceOperand[0].sel) || GPU7_ALU_SRC_IS_CONST_0F(aluInstruction->sourceOperand[1].sel)) + { + // result is always zero + src->add("0.0"); + } + else + { + // multiply + if (GPU7_ALU_SRC_IS_LITERAL(aluInstruction->sourceOperand[0].sel) || GPU7_ALU_SRC_IS_LITERAL(aluInstruction->sourceOperand[1].sel) || + GPU7_ALU_SRC_IS_ANY_CONST(aluInstruction->sourceOperand[0].sel) || GPU7_ALU_SRC_IS_ANY_CONST(aluInstruction->sourceOperand[1].sel)) + { + useDefaultMul = true; + } + if (shaderContext->options->strictMul && useDefaultMul == false) + { + src->add("mul_nonIEEE("); + _emitOperandInputCode(shaderContext, aluInstruction, 0, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(", "); + _emitOperandInputCode(shaderContext, aluInstruction, 1, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(")"); + } + else + { + _emitOperandInputCode(shaderContext, aluInstruction, 0, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(" * "); + _emitOperandInputCode(shaderContext, aluInstruction, 1, LATTE_DECOMPILER_DTYPE_FLOAT); + } + } + + 
_emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType); + src->add(";" _CRLF); + } + else if( aluInstruction->opcode == ALU_OP2_INST_MUL_IEEE ) + { + // 0*anything according to IEEE rules + _emitALUOperationBinary(shaderContext, aluInstruction, " * "); + } + else if (aluInstruction->opcode == ALU_OP2_INST_RECIP_IEEE) + { + _emitInstructionOutputVariableName(shaderContext, aluInstruction); + src->add(" = "); + _emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType); + src->add("1.0"); + src->add(" / "); + _emitOperandInputCode(shaderContext, aluInstruction, 0, LATTE_DECOMPILER_DTYPE_FLOAT); + _emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType); + src->add(";" _CRLF); + } + else if (aluInstruction->opcode == ALU_OP2_INST_RECIP_FF) + { + // untested (BotW bombs) + src->add("tempResultf = 1.0 / ("); + _emitOperandInputCode(shaderContext, aluInstruction, 0, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(");" _CRLF); + // INF becomes 0.0 + src->add("if( isinf(tempResultf) == true && (as_type(tempResultf)&0x80000000) == 0 ) tempResultf = 0.0;" _CRLF); + // -INF becomes -0.0 + src->add("else if( isinf(tempResultf) == true && (as_type(tempResultf)&0x80000000) != 0 ) tempResultf = -0.0;" _CRLF); + // assign result to output + _emitInstructionOutputVariableName(shaderContext, aluInstruction); + src->add(" = "); + _emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType); + src->add("tempResultf"); + _emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType); + src->add(";" _CRLF); + } + else if( aluInstruction->opcode == ALU_OP2_INST_RECIPSQRT_IEEE || + aluInstruction->opcode == ALU_OP2_INST_RECIPSQRT_CLAMPED || + aluInstruction->opcode == ALU_OP2_INST_RECIPSQRT_FF ) + { + // todo: This should be correct but testing is needed + src->add("tempResultf = 1.0 / sqrt("); + _emitOperandInputCode(shaderContext, aluInstruction, 0, 
LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(");" _CRLF); + if (aluInstruction->opcode == ALU_OP2_INST_RECIPSQRT_CLAMPED) + { + // note: if( -INF < 0.0 ) does not resolve to true + src->add("if( isinf(tempResultf) == true && (as_type(tempResultf)&0x80000000) != 0 ) tempResultf = -3.40282347E+38F;" _CRLF); + src->add("else if( isinf(tempResultf) == true && (as_type(tempResultf)&0x80000000) == 0 ) tempResultf = 3.40282347E+38F;" _CRLF); + } + else if (aluInstruction->opcode == ALU_OP2_INST_RECIPSQRT_FF) + { + // untested (BotW bombs) + src->add("if( isinf(tempResultf) == true && (as_type(tempResultf)&0x80000000) != 0 ) tempResultf = -0.0;" _CRLF); + src->add("else if( isinf(tempResultf) == true && (as_type(tempResultf)&0x80000000) == 0 ) tempResultf = 0.0;" _CRLF); + } + // assign result to output + _emitInstructionOutputVariableName(shaderContext, aluInstruction); + src->add(" = "); + _emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType); + src->add("tempResultf"); + _emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType); + src->add(";" _CRLF); + } + else if( aluInstruction->opcode == ALU_OP2_INST_MAX || + aluInstruction->opcode == ALU_OP2_INST_MIN || + aluInstruction->opcode == ALU_OP2_INST_MAX_DX10 || + aluInstruction->opcode == ALU_OP2_INST_MIN_DX10 ) + { + _emitInstructionOutputVariableName(shaderContext, aluInstruction); + src->add(" = "); + _emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType); + if( aluInstruction->opcode == ALU_OP2_INST_MAX ) + src->add("max"); + else if( aluInstruction->opcode == ALU_OP2_INST_MIN ) + src->add("min"); + else if (aluInstruction->opcode == ALU_OP2_INST_MAX_DX10) + src->add("max"); + else if (aluInstruction->opcode == ALU_OP2_INST_MIN_DX10) + src->add("min"); + src->add("("); + _emitOperandInputCode(shaderContext, aluInstruction, 0, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(", "); + _emitOperandInputCode(shaderContext, aluInstruction, 1, 
LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(")"); + _emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType); + src->add(";" _CRLF); + } + else if( aluInstruction->opcode == ALU_OP2_INST_FLOOR || + aluInstruction->opcode == ALU_OP2_INST_FRACT || + aluInstruction->opcode == ALU_OP2_INST_TRUNC ) + { + _emitInstructionOutputVariableName(shaderContext, aluInstruction); + src->add(" = "); + _emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType); + if( aluInstruction->opcode == ALU_OP2_INST_FLOOR ) + src->add("floor"); + else if( aluInstruction->opcode == ALU_OP2_INST_FRACT ) + src->add("fract"); + else + src->add("trunc"); + src->add("("); + _emitOperandInputCode(shaderContext, aluInstruction, 0, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(")"); + _emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType); + src->add(";" _CRLF); + } + else if( aluInstruction->opcode == ALU_OP2_INST_LOG_CLAMPED || + aluInstruction->opcode == ALU_OP2_INST_LOG_IEEE ) + { + src->add("tempResultf = max(0.0, "); + _emitOperandInputCode(shaderContext, aluInstruction, 0, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(");" _CRLF); + + src->add("tempResultf = log2(tempResultf);" _CRLF); + if( aluInstruction->opcode == ALU_OP2_INST_LOG_CLAMPED ) + { + src->add("if( isinf(tempResultf) == true ) tempResultf = -3.40282347E+38F;" _CRLF); + } + // assign result to output + _emitInstructionOutputVariableName(shaderContext, aluInstruction); + src->add(" = "); + _emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType); + src->add("tempResultf"); + _emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType); + src->add(";" _CRLF); + } + else if( aluInstruction->opcode == ALU_OP2_INST_RNDNE ) + { + _emitInstructionOutputVariableName(shaderContext, aluInstruction); + src->add(" = "); + _emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType); + 
src->add("rint("); + _emitOperandInputCode(shaderContext, aluInstruction, 0, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(")"); + _emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType); + src->add(";" _CRLF); + } + else if( aluInstruction->opcode == ALU_OP2_INST_EXP_IEEE ) + { + _emitInstructionOutputVariableName(shaderContext, aluInstruction); + src->add(" = "); + _emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType); + src->add("exp2"); + src->add("("); + _emitOperandInputCode(shaderContext, aluInstruction, 0, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(")"); + _emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType); + src->add(";" _CRLF); + } + else if( aluInstruction->opcode == ALU_OP2_INST_SQRT_IEEE ) + { + _emitInstructionOutputVariableName(shaderContext, aluInstruction); + src->add(" = "); + _emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType); + src->add("sqrt"); + src->add("("); + _emitOperandInputCode(shaderContext, aluInstruction, 0, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(")"); + _emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType); + src->add(";" _CRLF); + } + else if( aluInstruction->opcode == ALU_OP2_INST_SIN || + aluInstruction->opcode == ALU_OP2_INST_COS ) + { + _emitInstructionOutputVariableName(shaderContext, aluInstruction); + src->add(" = "); + _emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType); + if( aluInstruction->opcode == ALU_OP2_INST_SIN ) + src->add("sin"); + else + src->add("cos"); + src->add("(("); + _emitOperandInputCode(shaderContext, aluInstruction, 0, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(")/0.1591549367)"); + _emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType); + src->add(";" _CRLF); + } + else if( aluInstruction->opcode == ALU_OP2_INST_FLT_TO_INT ) + { + _emitInstructionOutputVariableName(shaderContext, 
aluInstruction); + src->add(" = "); + _emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_SIGNED_INT, outputType); + src->add("int"); + src->add("("); + _emitOperandInputCode(shaderContext, aluInstruction, 0, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(")"); + _emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_SIGNED_INT, outputType); + src->add(";" _CRLF); + } + else if( aluInstruction->opcode == ALU_OP2_INST_FLT_TO_UINT ) + { + _emitInstructionOutputVariableName(shaderContext, aluInstruction); + src->add(" = "); + _emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_UNSIGNED_INT, outputType); + src->add("uint"); + src->add("("); + _emitOperandInputCode(shaderContext, aluInstruction, 0, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(")"); + _emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_UNSIGNED_INT, outputType); + src->add(";" _CRLF); + } + else if( aluInstruction->opcode == ALU_OP2_INST_INT_TO_FLOAT ) + { + _emitInstructionOutputVariableName(shaderContext, aluInstruction); + src->add(" = "); + _emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType); + src->add("float("); + _emitOperandInputCode(shaderContext, aluInstruction, 0, LATTE_DECOMPILER_DTYPE_SIGNED_INT); + src->add(")"); + _emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType); + src->add(";" _CRLF); + } + else if( aluInstruction->opcode == ALU_OP2_INST_UINT_TO_FLOAT ) + { + _emitInstructionOutputVariableName(shaderContext, aluInstruction); + src->add(" = "); + _emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType); + src->add("float("); + _emitOperandInputCode(shaderContext, aluInstruction, 0, LATTE_DECOMPILER_DTYPE_UNSIGNED_INT); + src->add(")"); + _emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType); + src->add(";" _CRLF); + } + else if (aluInstruction->opcode == ALU_OP2_INST_AND_INT) + 
_emitALUOperationBinary(shaderContext, aluInstruction, " & "); + else if (aluInstruction->opcode == ALU_OP2_INST_OR_INT) + _emitALUOperationBinary(shaderContext, aluInstruction, " | "); + else if (aluInstruction->opcode == ALU_OP2_INST_XOR_INT) + _emitALUOperationBinary(shaderContext, aluInstruction, " ^ "); + else if( aluInstruction->opcode == ALU_OP2_INST_NOT_INT ) + { + _emitInstructionOutputVariableName(shaderContext, aluInstruction); + src->add(" = "); + _emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_SIGNED_INT, outputType); + src->add("~("); + _emitOperandInputCode(shaderContext, aluInstruction, 0, LATTE_DECOMPILER_DTYPE_SIGNED_INT); + src->add(")"); + _emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_SIGNED_INT, outputType); + src->add(";" _CRLF); + } + else if( aluInstruction->opcode == ALU_OP2_INST_ADD_INT ) + _emitALUOperationBinary(shaderContext, aluInstruction, " + "); + else if( aluInstruction->opcode == ALU_OP2_INST_MAX_INT || aluInstruction->opcode == ALU_OP2_INST_MIN_INT || + aluInstruction->opcode == ALU_OP2_INST_MAX_UINT || aluInstruction->opcode == ALU_OP2_INST_MIN_UINT) + { + // not verified + bool isUnsigned = aluInstruction->opcode == ALU_OP2_INST_MAX_UINT || aluInstruction->opcode == ALU_OP2_INST_MIN_UINT; + auto opType = isUnsigned ? 
LATTE_DECOMPILER_DTYPE_UNSIGNED_INT : LATTE_DECOMPILER_DTYPE_SIGNED_INT; + _emitInstructionOutputVariableName(shaderContext, aluInstruction); + src->add(" = "); + _emitTypeConversionPrefixMSL(shaderContext, opType, outputType); + if( aluInstruction->opcode == ALU_OP2_INST_MAX_INT || aluInstruction->opcode == ALU_OP2_INST_MAX_UINT ) + src->add("max("); + else + src->add("min("); + _emitOperandInputCode(shaderContext, aluInstruction, 0, opType); + src->add(", "); + _emitOperandInputCode(shaderContext, aluInstruction, 1, opType); + _emitTypeConversionSuffixMSL(shaderContext, opType, outputType); + src->add(");" _CRLF); + } + else if( aluInstruction->opcode == ALU_OP2_INST_SUB_INT ) + { + // note: The AMD doc says src1 is on the left side but tests indicate otherwise. It's src0 - src1. + _emitALUOperationBinary(shaderContext, aluInstruction, " - "); + } + else if (aluInstruction->opcode == ALU_OP2_INST_MULLO_INT) + _emitALUOperationBinary(shaderContext, aluInstruction, " * "); + else if (aluInstruction->opcode == ALU_OP2_INST_MULLO_UINT) + _emitALUOperationBinary(shaderContext, aluInstruction, " * "); + else if( aluInstruction->opcode == ALU_OP2_INST_LSHL_INT ) + _emitALUOperationBinary(shaderContext, aluInstruction, " << "); + else if( aluInstruction->opcode == ALU_OP2_INST_LSHR_INT ) + _emitALUOperationBinary(shaderContext, aluInstruction, " >> "); + else if( aluInstruction->opcode == ALU_OP2_INST_ASHR_INT ) + { + _emitInstructionOutputVariableName(shaderContext, aluInstruction); + src->add(" = "); + _emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_SIGNED_INT, outputType); + _emitOperandInputCode(shaderContext, aluInstruction, 0, LATTE_DECOMPILER_DTYPE_SIGNED_INT); + src->add(" >> "); + _emitOperandInputCode(shaderContext, aluInstruction, 1, LATTE_DECOMPILER_DTYPE_SIGNED_INT); + _emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_SIGNED_INT, outputType); + src->add(";" _CRLF); + } + else if( aluInstruction->opcode == 
ALU_OP2_INST_SETGT || + aluInstruction->opcode == ALU_OP2_INST_SETGE || + aluInstruction->opcode == ALU_OP2_INST_SETNE || + aluInstruction->opcode == ALU_OP2_INST_SETE ) + { + _emitInstructionOutputVariableName(shaderContext, aluInstruction); + src->add(" = "); + _emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType); + src->add("("); + _emitOperandInputCode(shaderContext, aluInstruction, 0, LATTE_DECOMPILER_DTYPE_FLOAT); + if( aluInstruction->opcode == ALU_OP2_INST_SETGT ) + src->add(" > "); + else if( aluInstruction->opcode == ALU_OP2_INST_SETGE ) + src->add(" >= "); + else if (aluInstruction->opcode == ALU_OP2_INST_SETNE) + src->add(" != "); + else if (aluInstruction->opcode == ALU_OP2_INST_SETE) + src->add(" == "); + _emitOperandInputCode(shaderContext, aluInstruction, 1, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(")?1.0:0.0"); + _emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType); + src->add(";" _CRLF); + } + else if( aluInstruction->opcode == ALU_OP2_INST_SETGT_DX10 || + aluInstruction->opcode == ALU_OP2_INST_SETE_DX10 || + aluInstruction->opcode == ALU_OP2_INST_SETNE_DX10 || + aluInstruction->opcode == ALU_OP2_INST_SETGE_DX10 ) + { + if( aluInstruction->omod != 0 ) + debugBreakpoint(); + _emitInstructionOutputVariableName(shaderContext, aluInstruction); + src->add(" = "); + _emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_SIGNED_INT, outputType); + src->add("(("); + _emitOperandInputCode(shaderContext, aluInstruction, 0, LATTE_DECOMPILER_DTYPE_FLOAT); + if( aluInstruction->opcode == ALU_OP2_INST_SETE_DX10 ) + src->add(" == "); + else if( aluInstruction->opcode == ALU_OP2_INST_SETNE_DX10 ) + src->add(" != "); + else if( aluInstruction->opcode == ALU_OP2_INST_SETGT_DX10 ) + src->add(" > "); + else if( aluInstruction->opcode == ALU_OP2_INST_SETGE_DX10 ) + src->add(" >= "); + _emitOperandInputCode(shaderContext, aluInstruction, 1, LATTE_DECOMPILER_DTYPE_FLOAT); + 
src->add(")?-1:0)"); + _emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_SIGNED_INT, outputType); + src->add(";"); + src->add(_CRLF); + } + else if( aluInstruction->opcode == ALU_OP2_INST_SETE_INT || + aluInstruction->opcode == ALU_OP2_INST_SETNE_INT || + aluInstruction->opcode == ALU_OP2_INST_SETGT_INT || + aluInstruction->opcode == ALU_OP2_INST_SETGE_INT ) + { + _emitInstructionOutputVariableName(shaderContext, aluInstruction); + src->add(" = "); + _emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_SIGNED_INT, outputType); + src->add("("); + _emitOperandInputCode(shaderContext, aluInstruction, 0, LATTE_DECOMPILER_DTYPE_SIGNED_INT); + if( aluInstruction->opcode == ALU_OP2_INST_SETE_INT ) + src->add(" == "); + else if( aluInstruction->opcode == ALU_OP2_INST_SETNE_INT ) + src->add(" != "); + else if( aluInstruction->opcode == ALU_OP2_INST_SETGT_INT ) + src->add(" > "); + else if( aluInstruction->opcode == ALU_OP2_INST_SETGE_INT ) + src->add(" >= "); + _emitOperandInputCode(shaderContext, aluInstruction, 1, LATTE_DECOMPILER_DTYPE_SIGNED_INT); + src->add(")?-1:0"); + _emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_SIGNED_INT, outputType); + src->add(";" _CRLF); + } + else if( aluInstruction->opcode == ALU_OP2_INST_SETGE_UINT || + aluInstruction->opcode == ALU_OP2_INST_SETGT_UINT ) + { + // todo: Unsure if the result is unsigned or signed + _emitInstructionOutputVariableName(shaderContext, aluInstruction); + src->add(" = "); + _emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_SIGNED_INT, outputType); + src->add("("); + _emitOperandInputCode(shaderContext, aluInstruction, 0, LATTE_DECOMPILER_DTYPE_UNSIGNED_INT); + if( aluInstruction->opcode == ALU_OP2_INST_SETGE_UINT ) + src->add(" >= "); + else if( aluInstruction->opcode == ALU_OP2_INST_SETGT_UINT ) + src->add(" > "); + _emitOperandInputCode(shaderContext, aluInstruction, 1, LATTE_DECOMPILER_DTYPE_UNSIGNED_INT); + 
src->add(")?int(0xFFFFFFFF):int(0x0)"); + _emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_SIGNED_INT, outputType); + src->add(";" _CRLF); + } + else if( aluInstruction->opcode == ALU_OP2_INST_PRED_SETGT || + aluInstruction->opcode == ALU_OP2_INST_PRED_SETGE || + aluInstruction->opcode == ALU_OP2_INST_PRED_SETE || + aluInstruction->opcode == ALU_OP2_INST_PRED_SETNE || + aluInstruction->opcode == ALU_OP2_INST_PRED_SETNE_INT || + aluInstruction->opcode == ALU_OP2_INST_PRED_SETE_INT || + aluInstruction->opcode == ALU_OP2_INST_PRED_SETGE_INT || + aluInstruction->opcode == ALU_OP2_INST_PRED_SETGT_INT ) + { + cemu_assert_debug(aluInstruction->writeMask == 0); + bool isIntPred = (aluInstruction->opcode == ALU_OP2_INST_PRED_SETNE_INT) || (aluInstruction->opcode == ALU_OP2_INST_PRED_SETE_INT) || (aluInstruction->opcode == ALU_OP2_INST_PRED_SETGE_INT) || (aluInstruction->opcode == ALU_OP2_INST_PRED_SETGT_INT); + + src->add("predResult"); + src->add(" = ("); + _emitOperandInputCode(shaderContext, aluInstruction, 0, isIntPred?LATTE_DECOMPILER_DTYPE_SIGNED_INT:LATTE_DECOMPILER_DTYPE_FLOAT); + + if (aluInstruction->opcode == ALU_OP2_INST_PRED_SETE || aluInstruction->opcode == ALU_OP2_INST_PRED_SETE_INT) + src->add(" == "); + else if (aluInstruction->opcode == ALU_OP2_INST_PRED_SETGT || aluInstruction->opcode == ALU_OP2_INST_PRED_SETGT_INT) + src->add(" > "); + else if (aluInstruction->opcode == ALU_OP2_INST_PRED_SETGE || aluInstruction->opcode == ALU_OP2_INST_PRED_SETGE_INT) + src->add(" >= "); + else if (aluInstruction->opcode == ALU_OP2_INST_PRED_SETNE || aluInstruction->opcode == ALU_OP2_INST_PRED_SETNE_INT) + src->add(" != "); + else + cemu_assert_debug(false); + + _emitOperandInputCode(shaderContext, aluInstruction, 1, isIntPred?LATTE_DECOMPILER_DTYPE_SIGNED_INT:LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(");" _CRLF); + // handle result of predicate instruction based on current ALU clause type + if( cfInstruction->type == GPU7_CF_INST_ALU_PUSH_BEFORE ) + { + 
src->addFmt("{} = predResult;" _CRLF, _getActiveMaskVarName(shaderContext, cfInstruction->activeStackDepth)); + src->addFmt("{} = predResult == true && {} == true;" _CRLF, _getActiveMaskCVarName(shaderContext, cfInstruction->activeStackDepth + 1), _getActiveMaskCVarName(shaderContext, cfInstruction->activeStackDepth)); + } + else if( cfInstruction->type == GPU7_CF_INST_ALU_BREAK ) + { + // leave current loop + src->add("if( predResult == false ) break;" _CRLF); + } + else + cemu_assert_debug(false); + } + else if( aluInstruction->opcode == ALU_OP2_INST_KILLE_INT || + aluInstruction->opcode == ALU_OP2_INST_KILLNE_INT || + aluInstruction->opcode == ALU_OP2_INST_KILLGT_INT) + { + src->add("if( "); + src->add(" ("); + _emitOperandInputCode(shaderContext, aluInstruction, 0, LATTE_DECOMPILER_DTYPE_SIGNED_INT); + if( aluInstruction->opcode == ALU_OP2_INST_KILLE_INT ) + src->add(" == "); + else if (aluInstruction->opcode == ALU_OP2_INST_KILLNE_INT) + src->add(" != "); + else if (aluInstruction->opcode == ALU_OP2_INST_KILLGT_INT) + src->add(" > "); + else + debugBreakpoint(); + _emitOperandInputCode(shaderContext, aluInstruction, 1, LATTE_DECOMPILER_DTYPE_SIGNED_INT); + src->add(")"); + src->add(") discard_fragment();"); + src->add(_CRLF); + } + else if( aluInstruction->opcode == ALU_OP2_INST_KILLGT || + aluInstruction->opcode == ALU_OP2_INST_KILLGE || + aluInstruction->opcode == ALU_OP2_INST_KILLE ) + { + src->add("if( "); + src->add(" ("); + _emitOperandInputCode(shaderContext, aluInstruction, 0, LATTE_DECOMPILER_DTYPE_FLOAT); + if( aluInstruction->opcode == ALU_OP2_INST_KILLGT ) + src->add(" > "); + else if( aluInstruction->opcode == ALU_OP2_INST_KILLGE ) + src->add(" >= "); + else if( aluInstruction->opcode == ALU_OP2_INST_KILLE ) + src->add(" == "); + else + debugBreakpoint(); + _emitOperandInputCode(shaderContext, aluInstruction, 1, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(")"); + src->add(") discard_fragment();"); + src->add(_CRLF); + } + else + { + 
src->add("Unsupported instruction;" _CRLF); + debug_printf("Unsupported ALU op2 instruction 0x%x\n", aluInstruction->opcode); + shaderContext->shader->hasError = true; + } +} + +static void _emitALUOP3InstructionCode(LatteDecompilerShaderContext* shaderContext, LatteDecompilerCFInstruction* cfInstruction, LatteDecompilerALUInstruction* aluInstruction) +{ + StringBuf* src = shaderContext->shaderSource; + cemu_assert_debug(aluInstruction->destRel == 0); // todo + sint32 outputType = _getALUInstructionOutputDataType(shaderContext, aluInstruction); + + /* check for common no-op or mov-like instructions */ + if (aluInstruction->opcode == ALU_OP3_INST_CMOVGE || + aluInstruction->opcode == ALU_OP3_INST_CMOVE || + aluInstruction->opcode == ALU_OP3_INST_CMOVGT || + aluInstruction->opcode == ALU_OP3_INST_CNDE_INT || + aluInstruction->opcode == ALU_OP3_INST_CNDGT_INT || + aluInstruction->opcode == ALU_OP3_INST_CMOVGE_INT) + { + if (_isSameGPROperand(aluInstruction, 1, 2) && !_operandHasModifiers(aluInstruction, 1)) + { + // the condition is irrelevant as both operands are the same + _emitInstructionOutputVariableName(shaderContext, aluInstruction); + src->add(" = "); + _emitOperandInputCode(shaderContext, aluInstruction, 1, outputType); + src->add(";" _CRLF); + return; + } + } + + + /* generic handlers */ + if( aluInstruction->opcode == ALU_OP3_INST_MULADD || + aluInstruction->opcode == ALU_OP3_INST_MULADD_D2 || + aluInstruction->opcode == ALU_OP3_INST_MULADD_M2 || + aluInstruction->opcode == ALU_OP3_INST_MULADD_M4 || + aluInstruction->opcode == ALU_OP3_INST_MULADD_IEEE ) + { + // todo: The difference between MULADD and MULADD IEEE is that the former has 0*anything=0 rule similar to MUL/MUL_IEEE? 
+ _emitInstructionOutputVariableName(shaderContext, aluInstruction); + src->add(" = "); + _emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType); + if (aluInstruction->opcode != ALU_OP3_INST_MULADD) // avoid unnecessary parenthesis to improve code readability slightly + src->add("("); + + bool useDefaultMul = false; + if (GPU7_ALU_SRC_IS_LITERAL(aluInstruction->sourceOperand[0].sel) || GPU7_ALU_SRC_IS_LITERAL(aluInstruction->sourceOperand[1].sel) || + GPU7_ALU_SRC_IS_ANY_CONST(aluInstruction->sourceOperand[0].sel) || GPU7_ALU_SRC_IS_ANY_CONST(aluInstruction->sourceOperand[1].sel)) + { + useDefaultMul = true; + } + if (aluInstruction->opcode == ALU_OP3_INST_MULADD_IEEE) + useDefaultMul = true; + + if (shaderContext->options->strictMul && useDefaultMul == false) + { + src->add("mul_nonIEEE("); + _emitOperandInputCode(shaderContext, aluInstruction, 0, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(","); + _emitOperandInputCode(shaderContext, aluInstruction, 1, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(")"); + } + else + { + _emitOperandInputCode(shaderContext, aluInstruction, 0, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(" * "); + _emitOperandInputCode(shaderContext, aluInstruction, 1, LATTE_DECOMPILER_DTYPE_FLOAT); + } + + src->add(" + "); + _emitOperandInputCode(shaderContext, aluInstruction, 2, LATTE_DECOMPILER_DTYPE_FLOAT); + if(aluInstruction->opcode != ALU_OP3_INST_MULADD) + src->add(")"); + if( aluInstruction->opcode == ALU_OP3_INST_MULADD_D2 ) + src->add("/2.0"); + else if( aluInstruction->opcode == ALU_OP3_INST_MULADD_M2 ) + src->add("*2.0"); + else if( aluInstruction->opcode == ALU_OP3_INST_MULADD_M4 ) + src->add("*4.0"); + _emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType); + src->add(";" _CRLF); + } + else if(aluInstruction->opcode == ALU_OP3_INST_CNDE_INT || aluInstruction->opcode == ALU_OP3_INST_CNDGT_INT || aluInstruction->opcode == ALU_OP3_INST_CMOVGE_INT) + { + bool requiresFloatResult = 
(aluInstruction->sourceOperand[1].neg != 0) || (aluInstruction->sourceOperand[2].neg != 0); + _emitInstructionOutputVariableName(shaderContext, aluInstruction); + src->add(" = "); + _emitTypeConversionPrefixMSL(shaderContext, requiresFloatResult?LATTE_DECOMPILER_DTYPE_FLOAT:LATTE_DECOMPILER_DTYPE_SIGNED_INT, outputType); + src->add("(("); + _emitOperandInputCode(shaderContext, aluInstruction, 0, LATTE_DECOMPILER_DTYPE_SIGNED_INT); + if (aluInstruction->opcode == ALU_OP3_INST_CNDE_INT) + src->add(" == "); + else if (aluInstruction->opcode == ALU_OP3_INST_CNDGT_INT) + src->add(" > "); + else if (aluInstruction->opcode == ALU_OP3_INST_CMOVGE_INT) + src->add(" >= "); + src->add("0)?("); + + _emitOperandInputCode(shaderContext, aluInstruction, 1, requiresFloatResult?LATTE_DECOMPILER_DTYPE_FLOAT:LATTE_DECOMPILER_DTYPE_SIGNED_INT); + src->add("):("); + _emitOperandInputCode(shaderContext, aluInstruction, 2, requiresFloatResult?LATTE_DECOMPILER_DTYPE_FLOAT:LATTE_DECOMPILER_DTYPE_SIGNED_INT); + src->add("))"); + _emitTypeConversionSuffixMSL(shaderContext, requiresFloatResult?LATTE_DECOMPILER_DTYPE_FLOAT:LATTE_DECOMPILER_DTYPE_SIGNED_INT, outputType); + src->add(";" _CRLF); + } + else if( aluInstruction->opcode == ALU_OP3_INST_CMOVGE || + aluInstruction->opcode == ALU_OP3_INST_CMOVE || + aluInstruction->opcode == ALU_OP3_INST_CMOVGT ) + { + _emitInstructionOutputVariableName(shaderContext, aluInstruction); + src->add(" = "); + _emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_SIGNED_INT, outputType); + src->add("(("); + _emitOperandInputCode(shaderContext, aluInstruction, 0, LATTE_DECOMPILER_DTYPE_FLOAT); + if (aluInstruction->opcode == ALU_OP3_INST_CMOVE) + src->add(" == "); + else if (aluInstruction->opcode == ALU_OP3_INST_CMOVGE) + src->add(" >= "); + else if (aluInstruction->opcode == ALU_OP3_INST_CMOVGT) + src->add(" > "); + src->add("0.0)?("); + _emitOperandInputCode(shaderContext, aluInstruction, 1, LATTE_DECOMPILER_DTYPE_SIGNED_INT); + 
src->add("):("); + _emitOperandInputCode(shaderContext, aluInstruction, 2, LATTE_DECOMPILER_DTYPE_SIGNED_INT); + src->add("))"); + _emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_SIGNED_INT, outputType); + src->add(";" _CRLF); + } + else + { + src->add("Unsupported instruction;" _CRLF); + debug_printf("Unsupported ALU op3 instruction 0x%x\n", aluInstruction->opcode); + shaderContext->shader->hasError = true; + } +} + +static void _emitALUReductionInstructionCode(LatteDecompilerShaderContext* shaderContext, LatteDecompilerALUInstruction* aluRedcInstruction[4]) +{ + StringBuf* src = shaderContext->shaderSource; + if( aluRedcInstruction[0]->isOP3 == false && (aluRedcInstruction[0]->opcode == ALU_OP2_INST_DOT4 || aluRedcInstruction[0]->opcode == ALU_OP2_INST_DOT4_IEEE) ) + { + // todo: Figure out and implement the difference between normal DOT4 and DOT4_IEEE + sint32 outputType = _getALUInstructionOutputDataType(shaderContext, aluRedcInstruction[0]); + _emitInstructionOutputVariableName(shaderContext, aluRedcInstruction[0]); + src->add(" = "); + _emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType); + + // dot(float4(op0),float4(op1)) + src->add("dot(float4("); + _emitOperandInputCode(shaderContext, aluRedcInstruction[0], 0, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(","); + _emitOperandInputCode(shaderContext, aluRedcInstruction[1], 0, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(","); + _emitOperandInputCode(shaderContext, aluRedcInstruction[2], 0, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(","); + _emitOperandInputCode(shaderContext, aluRedcInstruction[3], 0, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add("),float4("); + _emitOperandInputCode(shaderContext, aluRedcInstruction[0], 1, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(","); + _emitOperandInputCode(shaderContext, aluRedcInstruction[1], 1, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(","); + _emitOperandInputCode(shaderContext, aluRedcInstruction[2], 1, 
LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(","); + _emitOperandInputCode(shaderContext, aluRedcInstruction[3], 1, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add("))"); + + _emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType); + src->add(";" _CRLF); + } + else if( aluRedcInstruction[0]->isOP3 == false && (aluRedcInstruction[0]->opcode == ALU_OP2_INST_CUBE) ) + { + /* + * How the CUBE instruction works (guessed mostly, based on DirectX/OpenGL spec): + Input: float4, 3d direction vector (can be unnormalized) + w component (which can be ignored, since it only scales the vector but does not affect the direction) + + First we figure out the major axis (closest axis-aligned vector). There are six possible vectors: + +rx 0 + -rx 1 + +ry 2 + -ry 3 + +rz 4 + -rz 5 + The major axis vector is calculated by looking at the largest (absolute) 3d vector component and then setting the other components to 0.0 + The value that remains in the axis vector is referred to as 'MajorAxis' by the AMD documentation. + The S,T coordinates are taken from the other two components. + Example: -0.5,0.2,0.4 -> -rx -> -0.5,0.0,0.0 MajorAxis: -0.5, S: 0.2 T: 0.4 + + The CUBE reduction instruction requires a specific mapping for the input vector: + src0 = Rn.zzxy + src1 = Rn.yxzz + It's probably related to the way the instruction works internally? 
+ If we look at the individual components per ALU unit: + z y -> Compare y/z + z x -> Compare x/z + x z -> Compare x/z + y z -> Compare y/z + */ + + sint32 outputType; + + src->add("redcCUBE("); + src->add("float4("); + _emitOperandInputCode(shaderContext, aluRedcInstruction[0], 0, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(","); + _emitOperandInputCode(shaderContext, aluRedcInstruction[1], 0, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(","); + _emitOperandInputCode(shaderContext, aluRedcInstruction[2], 0, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(","); + _emitOperandInputCode(shaderContext, aluRedcInstruction[3], 0, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add("),"); + src->add("float4("); + _emitOperandInputCode(shaderContext, aluRedcInstruction[0], 1, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(","); + _emitOperandInputCode(shaderContext, aluRedcInstruction[1], 1, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(","); + _emitOperandInputCode(shaderContext, aluRedcInstruction[2], 1, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(","); + _emitOperandInputCode(shaderContext, aluRedcInstruction[3], 1, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add("),"); + src->add("cubeMapSTM,cubeMapFaceId);" _CRLF); + + // dst.X (S) + outputType = _getALUInstructionOutputDataType(shaderContext, aluRedcInstruction[0]); + _emitInstructionOutputVariableName(shaderContext, aluRedcInstruction[0]); + src->add(" = "); + _emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType); + src->add("cubeMapSTM.x"); + _emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType); + src->add(";" _CRLF); + // dst.Y (T) + outputType = _getALUInstructionOutputDataType(shaderContext, aluRedcInstruction[1]); + _emitInstructionOutputVariableName(shaderContext, aluRedcInstruction[1]); + src->add(" = "); + _emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType); + src->add("cubeMapSTM.y"); + _emitTypeConversionSuffixMSL(shaderContext, 
LATTE_DECOMPILER_DTYPE_FLOAT, outputType); + src->add(";" _CRLF); + // dst.Z (MajorAxis) + outputType = _getALUInstructionOutputDataType(shaderContext, aluRedcInstruction[2]); + _emitInstructionOutputVariableName(shaderContext, aluRedcInstruction[2]); + src->add(" = "); + _emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType); + src->add("cubeMapSTM.z"); + _emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, outputType); + src->add(";" _CRLF); + // dst.W (FaceId) + outputType = _getALUInstructionOutputDataType(shaderContext, aluRedcInstruction[3]); + _emitInstructionOutputVariableName(shaderContext, aluRedcInstruction[3]); + src->add(" = "); + _emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_SIGNED_INT, outputType); + src->add("cubeMapFaceId"); + _emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_SIGNED_INT, outputType); + src->add(";" _CRLF); + } + else + cemu_assert_unimplemented(); +} + +static void _emitALUClauseRegisterBackupCode(LatteDecompilerShaderContext* shaderContext, LatteDecompilerCFInstruction* cfInstruction, sint32 startIndex) +{ + sint32 instructionGroupIndex = cfInstruction->instructionsALU[startIndex].instructionGroupIndex; + size_t groupSize = 1; + while ((startIndex + groupSize) < cfInstruction->instructionsALU.size()) + { + if (instructionGroupIndex != cfInstruction->instructionsALU[startIndex + groupSize].instructionGroupIndex) + break; + groupSize++; + } + shaderContext->aluPVPSState->CreateGPRTemporaries(shaderContext, { cfInstruction->instructionsALU.data() + startIndex, groupSize }); +} + +/* +bool _isPVUsedInNextGroup(LatteDecompilerCFInstruction* cfInstruction, sint32 startIndex, sint32 pvUnit) +{ + sint32 currentGroupIndex = cfInstruction->instructionsALU[startIndex].instructionGroupIndex; + for (sint32 i = startIndex + 1; i < (sint32)cfInstruction->instructionsALU.size(); i++) + { + LatteDecompilerALUInstruction& aluInstructionItr = 
cfInstruction->instructionsALU[i]; + if(aluInstructionItr.instructionGroupIndex == currentGroupIndex ) + continue; + if ((sint32)aluInstructionItr.instructionGroupIndex > currentGroupIndex + 1) + return false; + // check OP code type + if (aluInstructionItr.isOP3) + { + // op0 + if (GPU7_ALU_SRC_IS_PV(aluInstructionItr.sourceOperand[0].sel)) + { + uint32 chan = aluInstructionItr.sourceOperand[0].chan; + if (pvUnit == chan) + return true; + } + // op1 + if (GPU7_ALU_SRC_IS_PV(aluInstructionItr.sourceOperand[1].sel)) + { + uint32 chan = aluInstructionItr.sourceOperand[1].chan; + if (pvUnit == chan) + return true; + } + // op2 + if (GPU7_ALU_SRC_IS_PV(aluInstructionItr.sourceOperand[2].sel)) + { + uint32 chan = aluInstructionItr.sourceOperand[2].chan; + if (pvUnit == chan) + return true; + } + } + else + { + // op0 + if (GPU7_ALU_SRC_IS_PV(aluInstructionItr.sourceOperand[0].sel)) + { + uint32 chan = aluInstructionItr.sourceOperand[0].chan; + if (pvUnit == chan) + return true; + } + // op1 + if (GPU7_ALU_SRC_IS_PV(aluInstructionItr.sourceOperand[1].sel)) + { + uint32 chan = aluInstructionItr.sourceOperand[1].chan; + if (pvUnit == chan) + return true; + } + // todo: Not all operations use both operands + } + } + return false; +} +*/ + +static void _emitFloat3(LatteDecompilerShaderContext* shaderContext, uint32 dataType, LatteDecompilerALUInstruction* aluInst0, sint32 opIdx0, LatteDecompilerALUInstruction* aluInst1, sint32 opIdx1, LatteDecompilerALUInstruction* aluInst2, sint32 opIdx2) +{ + StringBuf* src = shaderContext->shaderSource; + if (dataType == LATTE_DECOMPILER_DTYPE_FLOAT) + { + src->add("float3("); + _emitOperandInputCode(shaderContext, aluInst0, opIdx0, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(","); + _emitOperandInputCode(shaderContext, aluInst1, opIdx1, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(","); + _emitOperandInputCode(shaderContext, aluInst2, opIdx2, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(")"); + } + else if (dataType == 
LATTE_DECOMPILER_DTYPE_SIGNED_INT) + { + src->add("int3("); + _emitOperandInputCode(shaderContext, aluInst0, opIdx0, LATTE_DECOMPILER_DTYPE_SIGNED_INT); + src->add(","); + _emitOperandInputCode(shaderContext, aluInst1, opIdx1, LATTE_DECOMPILER_DTYPE_SIGNED_INT); + src->add(","); + _emitOperandInputCode(shaderContext, aluInst2, opIdx2, LATTE_DECOMPILER_DTYPE_SIGNED_INT); + src->add(")"); + } + else + cemu_assert_unimplemented(); +} + +static void _emitGPRVectorAssignment(LatteDecompilerShaderContext* shaderContext, LatteDecompilerALUInstruction** aluInstructions, sint32 count) +{ + StringBuf* src = shaderContext->shaderSource; + // output var name (GPR) + src->add(_getRegisterVarName(shaderContext, aluInstructions[0]->destGpr, -1)); + src->add("."); + for (sint32 f = 0; f < count; f++) + { + src->add(_getElementStrByIndex(aluInstructions[f]->destElem)); + } + src->add(" = "); +} + +static void _emitALUClauseCode(LatteDecompilerShaderContext* shaderContext, LatteDecompilerCFInstruction* cfInstruction) +{ + ALUClauseTemporariesState pvpsState; + shaderContext->aluPVPSState = &pvpsState; + StringBuf* src = shaderContext->shaderSource; + LatteDecompilerALUInstruction* aluRedcInstruction[4]; + size_t groupStartIndex = 0; + for(size_t i=0; iinstructionsALU.size(); i++) + { + LatteDecompilerALUInstruction& aluInstruction = cfInstruction->instructionsALU[i]; + if( aluInstruction.indexInGroup == 0 ) + { + src->addFmt("// {}" _CRLF, aluInstruction.instructionGroupIndex); + // apply PV/PS updates for previous group + if (i > 0) + { + pvpsState.TrackGroupOutputPVPS(shaderContext, cfInstruction->instructionsALU.data() + groupStartIndex, i - groupStartIndex); + } + groupStartIndex = i; + // backup registers which are read after being written + _emitALUClauseRegisterBackupCode(shaderContext, cfInstruction, i); + } + // detect reduction instructions and use a special handler + bool isReductionOperation = _isReductionInstruction(&aluInstruction); + if( isReductionOperation ) + { + 
cemu_assert_debug((i + 4) <= cfInstruction->instructionsALU.size()); + aluRedcInstruction[0] = &aluInstruction; + aluRedcInstruction[1] = &cfInstruction->instructionsALU[i + 1]; + aluRedcInstruction[2] = &cfInstruction->instructionsALU[i + 2]; + aluRedcInstruction[3] = &cfInstruction->instructionsALU[i + 3]; + if( aluRedcInstruction[0]->isOP3 != aluRedcInstruction[1]->isOP3 || aluRedcInstruction[1]->isOP3 != aluRedcInstruction[2]->isOP3 || aluRedcInstruction[2]->isOP3 != aluRedcInstruction[3]->isOP3 ) + debugBreakpoint(); + if( aluRedcInstruction[0]->opcode != aluRedcInstruction[1]->opcode || aluRedcInstruction[1]->opcode != aluRedcInstruction[2]->opcode || aluRedcInstruction[2]->opcode != aluRedcInstruction[3]->opcode ) + debugBreakpoint(); + if( aluRedcInstruction[0]->omod != aluRedcInstruction[1]->omod || aluRedcInstruction[1]->omod != aluRedcInstruction[2]->omod || aluRedcInstruction[2]->omod != aluRedcInstruction[3]->omod ) + debugBreakpoint(); + if( aluRedcInstruction[0]->destClamp != aluRedcInstruction[1]->destClamp || aluRedcInstruction[1]->destClamp != aluRedcInstruction[2]->destClamp || aluRedcInstruction[2]->destClamp != aluRedcInstruction[3]->destClamp ) + debugBreakpoint(); + _emitALUReductionInstructionCode(shaderContext, aluRedcInstruction); + i += 3; // skip the instructions that are part of the reduction operation + } + else /* not a reduction operation */ + { + if( aluInstruction.isOP3 ) + { + // op3 + _emitALUOP3InstructionCode(shaderContext, cfInstruction, &aluInstruction); + } + else + { + // op2 + if( aluInstruction.opcode == ALU_OP2_INST_NOP ) + continue; // skip NOP instruction + _emitALUOP2InstructionCode(shaderContext, cfInstruction, &aluInstruction); + } + } + // handle omod + sint32 outputDataType = _getALUInstructionOutputDataType(shaderContext, &aluInstruction); + if( aluInstruction.omod != ALU_OMOD_NONE ) + { + if( outputDataType == LATTE_DECOMPILER_DTYPE_FLOAT ) + { + _emitInstructionOutputVariableName(shaderContext, 
&aluInstruction); + if( aluInstruction.omod == ALU_OMOD_MUL2 ) + src->add(" *= 2.0;" _CRLF); + else if( aluInstruction.omod == ALU_OMOD_MUL4 ) + src->add(" *= 4.0;" _CRLF); + else if( aluInstruction.omod == ALU_OMOD_DIV2 ) + src->add(" /= 2.0;" _CRLF); + } + else if( outputDataType == LATTE_DECOMPILER_DTYPE_SIGNED_INT ) + { + _emitInstructionOutputVariableName(shaderContext, &aluInstruction); + src->add(" = "); + src->add("as_type(as_type("); // TODO: correct? + _emitInstructionOutputVariableName(shaderContext, &aluInstruction); + src->add(")"); + if( aluInstruction.omod == 1 ) + src->add(" * 2.0"); + else if( aluInstruction.omod == 2 ) + src->add(" * 4.0"); + else if( aluInstruction.omod == 3 ) + src->add(" / 2.0"); + src->add(");" _CRLF); + } + else + { + cemu_assert_unimplemented(); + } + } + // handle clamp + if( aluInstruction.destClamp != 0 ) + { + if( outputDataType == LATTE_DECOMPILER_DTYPE_FLOAT ) + { + _emitInstructionOutputVariableName(shaderContext, &aluInstruction); + src->add(" = clamp("); + _emitInstructionOutputVariableName(shaderContext, &aluInstruction); + src->add(", 0.0, 1.0);" _CRLF); + } + else if( outputDataType == LATTE_DECOMPILER_DTYPE_SIGNED_INT ) + { + _emitInstructionOutputVariableName(shaderContext, &aluInstruction); + src->add(" = clampFI32("); + _emitInstructionOutputVariableName(shaderContext, &aluInstruction); + src->add(");" _CRLF); + } + else + { + cemu_assert_unimplemented(); + } + } + // handle result broadcasting for reduction instructions + if( isReductionOperation ) + { + // reduction operations set all four PV components (todo: Needs further research. According to AMD docs, dot4 only sets PV.x? update: Unlike DOT4, CUBE sets all PV elements accordingly to their GPR output?) 
+ if( aluRedcInstruction[0]->opcode == ALU_OP2_INST_CUBE ) + { + // CUBE + for (sint32 f = 0; f < 4; f++) + { + if (aluRedcInstruction[f]->writeMask != 0) + continue; + _emitInstructionPVPSOutputVariableName(shaderContext, aluRedcInstruction[f]); + src->add(" = "); + _emitInstructionOutputVariableName(shaderContext, aluRedcInstruction[0]); + src->add(";" _CRLF); + } + } + else + { + // DOT4, DOT4_IEEE, etc. + // reduction operation result is only set for output in redc[0], we also need to update redc[1] to redc[3] + for(sint32 f=0; f<4; f++) + { + if( aluRedcInstruction[f]->writeMask == 0 ) + _emitInstructionPVPSOutputVariableName(shaderContext, aluRedcInstruction[f]); + else + { + if (f == 0) + continue; + _emitInstructionOutputVariableName(shaderContext, aluRedcInstruction[f]); + } + src->add(" = "); + _emitInstructionOutputVariableName(shaderContext, aluRedcInstruction[0]); + src->add(";" _CRLF); + } + } + } + } + shaderContext->aluPVPSState = nullptr; +} + +/* + * Emits code to access one component (xyzw) of the texture coordinate input vector + */ +static void _emitTEXSampleCoordInputComponent(LatteDecompilerShaderContext* shaderContext, LatteDecompilerTEXInstruction* texInstruction, sint32 componentIndex, sint32 interpretSrcAsType) +{ + cemu_assert(componentIndex >= 0 && componentIndex < 4); + cemu_assert_debug(interpretSrcAsType == LATTE_DECOMPILER_DTYPE_SIGNED_INT || interpretSrcAsType == LATTE_DECOMPILER_DTYPE_FLOAT); + StringBuf* src = shaderContext->shaderSource; + sint32 elementSel = texInstruction->textureFetch.srcSel[componentIndex]; + if (elementSel < 4) + { + _emitRegisterChannelAccessCode(shaderContext, texInstruction->srcGpr, elementSel, interpretSrcAsType); + return; + } + const char* resultElemTable[4] = {"x","y","z","w"}; + if(interpretSrcAsType == LATTE_DECOMPILER_DTYPE_SIGNED_INT ) + { + if( elementSel == 4 ) + src->add("as_type(0.0)"); + else if( elementSel == 5 ) + src->add("as_type(1.0)"); + } + else if(interpretSrcAsType == 
LATTE_DECOMPILER_DTYPE_FLOAT ) + { + if( elementSel == 4 ) + src->add("0.0"); + else if( elementSel == 5 ) + src->add("1.0"); + } +} + +static const char* _texGprAccessElemTable[8] = {"x","y","z","w","_","_","_","_"}; + +static char* _getTexGPRAccess(LatteDecompilerShaderContext* shaderContext, sint32 gprIndex, uint32 dataType, sint8 selX, sint8 selY, sint8 selZ, sint8 selW, char* tempBuffer) +{ + // as_type(R{}i.w) + *tempBuffer = '\0'; + uint8 elemCount = (selX >= 0 ? 1 : 0) + (selY >= 0 ? 1 : 0) + (selZ >= 0 ? 1 : 0) + (selW >= 0 ? 1 : 0); + if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_SIGNED_INT) + { + if (dataType == LATTE_DECOMPILER_DTYPE_SIGNED_INT) + ; // no conversion + else if (dataType == LATTE_DECOMPILER_DTYPE_FLOAT) + { + if (elemCount == 1) + strcat(tempBuffer, "as_type("); + else + strcat(tempBuffer, ("as_type(").c_str()); + } + else + cemu_assert_unimplemented(); + strcat(tempBuffer, _getRegisterVarName(shaderContext, gprIndex)); + // _texGprAccessElemTable + strcat(tempBuffer, "."); + if (selX >= 0) + strcat(tempBuffer, _texGprAccessElemTable[selX]); + if (selY >= 0) + strcat(tempBuffer, _texGprAccessElemTable[selY]); + if (selZ >= 0) + strcat(tempBuffer, _texGprAccessElemTable[selZ]); + if (selW >= 0) + strcat(tempBuffer, _texGprAccessElemTable[selW]); + if (dataType == LATTE_DECOMPILER_DTYPE_SIGNED_INT) + ; // no conversion + else if (dataType == LATTE_DECOMPILER_DTYPE_FLOAT) + strcat(tempBuffer, ")"); + else + cemu_assert_unimplemented(); + } + else if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_FLOAT) + { + if (dataType == LATTE_DECOMPILER_DTYPE_SIGNED_INT) + cemu_assert_unimplemented(); + else if (dataType == LATTE_DECOMPILER_DTYPE_FLOAT) + ; // no conversion + else + cemu_assert_unimplemented(); + strcat(tempBuffer, _getRegisterVarName(shaderContext, gprIndex)); + // _texGprAccessElemTable + strcat(tempBuffer, "."); + if (selX >= 0) + strcat(tempBuffer, _texGprAccessElemTable[selX]); + if 
(selY >= 0) + strcat(tempBuffer, _texGprAccessElemTable[selY]); + if (selZ >= 0) + strcat(tempBuffer, _texGprAccessElemTable[selZ]); + if (selW >= 0) + strcat(tempBuffer, _texGprAccessElemTable[selW]); + if (dataType == LATTE_DECOMPILER_DTYPE_SIGNED_INT) + cemu_assert_unimplemented(); + else if (dataType == LATTE_DECOMPILER_DTYPE_FLOAT) + ; // no conversion + else + cemu_assert_unimplemented(); + } + else + cemu_assert_unimplemented(); + return tempBuffer; +} + +static void _emitTEXSampleTextureCode(LatteDecompilerShaderContext* shaderContext, LatteDecompilerTEXInstruction* texInstruction) +{ + StringBuf* src = shaderContext->shaderSource; + if (texInstruction->textureFetch.textureIndex < 0 || texInstruction->textureFetch.textureIndex >= LATTE_NUM_MAX_TEX_UNITS) + { + // skip out of bounds texture unit access + return; + } + + auto texDim = shaderContext->shader->textureUnitDim[texInstruction->textureFetch.textureIndex]; + + char tempBuffer0[32]; + char tempBuffer1[32]; + src->add(_getRegisterVarName(shaderContext, texInstruction->dstGpr)); + src->add("."); + const char* resultElemTable[4] = {"x","y","z","w"}; + sint32 numWrittenElements = 0; + for(sint32 f = 0; f < 4; f++) + { + if( texInstruction->dstSel[f] < 4 ) + { + src->add(resultElemTable[f]); + numWrittenElements++; + } + else if( texInstruction->dstSel[f] == 7 ) + { + // masked and not written + } + else + { + debugBreakpoint(); + } + } + // texture sampler opcode + uint32 texOpcode = texInstruction->opcode; + // TODO: is this needed? + if (shaderContext->shaderType == LatteConst::ShaderType::Vertex) + { + // vertex shader forces LOD to zero, but certain sampler types don't support textureLod(...) 
API + if (texOpcode == GPU7_TEX_INST_SAMPLE_C_LZ) + texOpcode = GPU7_TEX_INST_SAMPLE_C; + } + // check if offset is used + bool hasOffset = false; + if( texInstruction->textureFetch.offsetX != 0 || texInstruction->textureFetch.offsetY != 0 || texInstruction->textureFetch.offsetZ != 0 ) + hasOffset = true; + // emit sample code + if (shaderContext->shader->textureIsIntegerFormat[texInstruction->textureFetch.textureIndex]) + { + // integer samplers + if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_SIGNED_INT) // uint to int + { + if (numWrittenElements == 1) + src->add(" = int("); + else + shaderContext->shaderSource->addFmt(" = int{}(", numWrittenElements); + } + else if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_FLOAT) + { + if (numWrittenElements == 1) + src->add(" = as_type("); + else + shaderContext->shaderSource->addFmt(" = as_type(", numWrittenElements); + } + } + else + { + // float samplers + if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_SIGNED_INT) + { + if (numWrittenElements == 1) + src->add(" = as_type("); + else + shaderContext->shaderSource->addFmt(" = as_type(", numWrittenElements); + } + else if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_FLOAT) + src->add(" = ("); + } + + bool isCompare = shaderContext->shader->textureUsesDepthCompare[texInstruction->textureFetch.textureIndex]; + bool emulateCompare = (isCompare && !IsValidDepthTextureType(texDim)); + bool isGather = (texOpcode == GPU7_TEX_INST_FETCH4); + + bool unnormalizationHandled = false; + bool useTexelCoordinates = false; + bool isRead = ((texOpcode == GPU7_TEX_INST_SAMPLE && (texInstruction->textureFetch.unnormalized[0] && texInstruction->textureFetch.unnormalized[1] && texInstruction->textureFetch.unnormalized[2] && texInstruction->textureFetch.unnormalized[3])) || texOpcode == GPU7_TEX_INST_LD); + + // handle illegal combinations + if (texOpcode == GPU7_TEX_INST_FETCH4 && (texDim == 
Latte::E_DIM::DIM_1D || texDim == Latte::E_DIM::DIM_1D_ARRAY)) + { + // fetch4 is not allowed on 1D textures + // seen in YWW during boss fight of Level 1-4 + // todo - investigate what this returns on actual HW + if (numWrittenElements == 1) + shaderContext->shaderSource->add("0.0"); + else + shaderContext->shaderSource->addFmt("float{}(0.0)", numWrittenElements); + shaderContext->shaderSource->add(");" _CRLF); + return; + } + + // Do a framebuffer fetch if possible + uint8 renderTargetIndex = shaderContext->shader->textureRenderTargetIndex[texInstruction->textureFetch.textureIndex]; + if (static_cast(g_renderer.get())->SupportsFramebufferFetch() && renderTargetIndex != 255) + { + // TODO: support comparison samplers + // TODO: support swizzling + src->addFmt("col{}", renderTargetIndex); + } + else + { + // sample_compare returns a float, need to convert to float4 + if (isCompare) + src->addFmt("float4("); + + if (emulateCompare) + { + cemu_assert_debug(!isGather); + + src->add("sampleCompareEmulate("); + } + + src->addFmt("tex{}", texInstruction->textureFetch.textureIndex); + if (!emulateCompare) + { + src->add("."); + if (isRead) + { + if (hasOffset) + cemu_assert_unimplemented(); + src->add("read("); + unnormalizationHandled = true; + useTexelCoordinates = true; + } + else + { + if (isGather) + src->add("gather"); + else + src->add("sample"); + if (isCompare) + src->add("_compare"); + src->addFmt("(samplr{}, ", texInstruction->textureFetch.textureIndex); + } + } + else + { + src->addFmt(", samplr{}, ", texInstruction->textureFetch.textureIndex); + } + + // for textureGather() add shift (todo: depends on rounding mode set in sampler registers?) 
+ if (texOpcode == GPU7_TEX_INST_FETCH4) + { + if (texDim == Latte::E_DIM::DIM_2D) + { + //src->addFmt2("(vec2(-0.1) / vec2(textureSize(tex{},0).xy)) + ", texInstruction->textureIndex); + + // vec2(-0.00001) is minimum to break Nvidia + // vec2(0.0001) is minimum to fix shadows on Intel, also fixes it on AMD (Windows and Linux) + + // todo - emulating coordinate rounding mode correctly is tricky + // GX2 supports two modes: Truncate or rounding according to DX9 rules + // Vulkan uses truncate mode when point sampling (min and mag is both nearest) otherwise it uses rounding + + // adding a small fixed bias is enough to avoid vendor-specific cases where small inaccuracies cause the number to get rounded down due to truncation + src->addFmt("float2(0.0001) + "); + } + } + + const sint32 texCoordDataType = (texOpcode == GPU7_TEX_INST_LD) ? LATTE_DECOMPILER_DTYPE_SIGNED_INT : LATTE_DECOMPILER_DTYPE_FLOAT; + if(useTexelCoordinates) + { + // handle integer coordinates for texelFetch + if (texDim == Latte::E_DIM::DIM_2D || texDim == Latte::E_DIM::DIM_2D_MSAA) + { + src->add("uint2("); + src->add("float2("); + _emitTEXSampleCoordInputComponent(shaderContext, texInstruction, 0, texCoordDataType); + src->addFmt(", "); + _emitTEXSampleCoordInputComponent(shaderContext, texInstruction, 1, texCoordDataType); + + src->addFmt(")*supportBuffer.tex{}Scale", texInstruction->textureFetch.textureIndex); // close float2 and scale + + src->add("), 0"); // close int2 and lod param + // todo - lod + } + else if (texDim == Latte::E_DIM::DIM_1D) + { + // VC DS games forget to initialize textures and use texel fetch on an uninitialized texture (a dim of 0 maps to 1D) + src->add("uint("); + src->add("float("); + _emitTEXSampleCoordInputComponent(shaderContext, texInstruction, 0, (texOpcode == GPU7_TEX_INST_LD) ? 
LATTE_DECOMPILER_DTYPE_SIGNED_INT : LATTE_DECOMPILER_DTYPE_FLOAT); + src->addFmt(")*supportBuffer.tex{}Scale.x", texInstruction->textureFetch.textureIndex); + src->add("), 0"); + // todo - lod + } + else + cemu_assert_debug(false); + } + else /* useTexelCoordinates == false */ + { + // float coordinates + if ( (texOpcode == GPU7_TEX_INST_SAMPLE_C || texOpcode == GPU7_TEX_INST_SAMPLE_C_L || texOpcode == GPU7_TEX_INST_SAMPLE_C_LZ) ) + { + // shadow sampler + if (texDim == Latte::E_DIM::DIM_2D_ARRAY) + { + // 3 coords + compare value + src->add("float2("); + _emitTEXSampleCoordInputComponent(shaderContext, texInstruction, 0, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(", "); + _emitTEXSampleCoordInputComponent(shaderContext, texInstruction, 1, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add("), uint(rint("); + _emitTEXSampleCoordInputComponent(shaderContext, texInstruction, 2, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add("))"); + + src->addFmt(", {}", _getTexGPRAccess(shaderContext, texInstruction->srcGpr, LATTE_DECOMPILER_DTYPE_FLOAT, texInstruction->textureFetch.srcSel[3], -1, -1, -1, tempBuffer0)); + } + else if (texDim == Latte::E_DIM::DIM_CUBEMAP) + { + // 2 coords + faceId + if (texInstruction->textureFetch.srcSel[0] >= 4 || texInstruction->textureFetch.srcSel[1] >= 4) + { + debugBreakpoint(); + } + src->addFmt("redcCUBEReverse({},", _getTexGPRAccess(shaderContext, texInstruction->srcGpr, LATTE_DECOMPILER_DTYPE_FLOAT, texInstruction->textureFetch.srcSel[0], texInstruction->textureFetch.srcSel[1], -1, -1, tempBuffer0)); + _emitTEXSampleCoordInputComponent(shaderContext, texInstruction, 2, LATTE_DECOMPILER_DTYPE_SIGNED_INT); + src->addFmt(")"); + src->addFmt(", uint(cubeMapArrayIndex{})", texInstruction->textureFetch.textureIndex); // cubemap index + } + else if (texDim == Latte::E_DIM::DIM_1D) + { + // 1 coord + 1 unused coord (per spec) + compare value + if (texInstruction->textureFetch.srcSel[0] >= 4) + { + debugBreakpoint(); + } + src->addFmt("{}, {}", 
_getTexGPRAccess(shaderContext, texInstruction->srcGpr, LATTE_DECOMPILER_DTYPE_FLOAT, texInstruction->textureFetch.srcSel[0], -1, -1, -1, tempBuffer0), _getTexGPRAccess(shaderContext, texInstruction->srcGpr, LATTE_DECOMPILER_DTYPE_FLOAT, texInstruction->textureFetch.srcSel[3], -1, -1, -1, tempBuffer1)); + } + else + { + // 2 coords + compare value (as float3) + if (texInstruction->textureFetch.srcSel[0] >= 4 && texInstruction->textureFetch.srcSel[1] >= 4) + { + debugBreakpoint(); + } + src->addFmt("float2({}), {}", _getTexGPRAccess(shaderContext, texInstruction->srcGpr, LATTE_DECOMPILER_DTYPE_FLOAT, texInstruction->textureFetch.srcSel[0], texInstruction->textureFetch.srcSel[1], -1, -1, tempBuffer0), _getTexGPRAccess(shaderContext, texInstruction->srcGpr, LATTE_DECOMPILER_DTYPE_FLOAT, texInstruction->textureFetch.srcSel[3], -1, -1, -1, tempBuffer1)); + } + } + else if(texDim == Latte::E_DIM::DIM_2D_ARRAY) + { + // 3 coords + src->add("float2("); + _emitTEXSampleCoordInputComponent(shaderContext, texInstruction, 0, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(", "); + _emitTEXSampleCoordInputComponent(shaderContext, texInstruction, 1, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add("), uint(rint("); + _emitTEXSampleCoordInputComponent(shaderContext, texInstruction, 2, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add("))"); + } + else if(texDim == Latte::E_DIM::DIM_3D) + { + // 3 coords + src->add("float3("); + _emitTEXSampleCoordInputComponent(shaderContext, texInstruction, 0, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(", "); + _emitTEXSampleCoordInputComponent(shaderContext, texInstruction, 1, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(", "); + _emitTEXSampleCoordInputComponent(shaderContext, texInstruction, 2, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(")"); + } + else if( texDim == Latte::E_DIM::DIM_CUBEMAP ) + { + // 2 coords + faceId + cemu_assert_debug(texInstruction->textureFetch.srcSel[0] < 4); + cemu_assert_debug(texInstruction->textureFetch.srcSel[1] < 4); + 
src->addFmt("redcCUBEReverse({},", _getTexGPRAccess(shaderContext, texInstruction->srcGpr, LATTE_DECOMPILER_DTYPE_FLOAT, texInstruction->textureFetch.srcSel[0], texInstruction->textureFetch.srcSel[1], -1, -1, tempBuffer0)); + _emitTEXSampleCoordInputComponent(shaderContext, texInstruction, 2, LATTE_DECOMPILER_DTYPE_SIGNED_INT); + src->add(")"); + src->addFmt(", uint(cubeMapArrayIndex{})", texInstruction->textureFetch.textureIndex); // cubemap index + } + else if( texDim == Latte::E_DIM::DIM_1D ) + { + // 1 coord + src->add(_getTexGPRAccess(shaderContext, texInstruction->srcGpr, LATTE_DECOMPILER_DTYPE_FLOAT, texInstruction->textureFetch.srcSel[0], -1, -1, -1, tempBuffer0)); + } + else + { + // 2 coords + src->add("float2("); + _emitTEXSampleCoordInputComponent(shaderContext, texInstruction, 0, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(","); + _emitTEXSampleCoordInputComponent(shaderContext, texInstruction, 1, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(")"); + // avoid truncate to effectively round downwards on texel edges + if (ActiveSettings::ForceSamplerRoundToPrecision()) + src->addFmt("+ float2(1.0)/float2(tex{}.get_width(), tex{}.get_height())/512.0", texInstruction->textureFetch.textureIndex, texInstruction->textureFetch.textureIndex); + } + // lod or lod bias parameter + // 1D textures don't support lod + if (texDim != Latte::E_DIM::DIM_1D && texDim != Latte::E_DIM::DIM_1D_ARRAY) + { + if (texOpcode == GPU7_TEX_INST_SAMPLE_L || texOpcode == GPU7_TEX_INST_SAMPLE_LB || texOpcode == GPU7_TEX_INST_SAMPLE_C_L) + { + src->add(", "); + if (texOpcode == GPU7_TEX_INST_SAMPLE_LB) + { + src->addFmt("bias({})", _FormatFloatAsConstant((float)texInstruction->textureFetch.lodBias / 16.0f)); + } + else + { + // TODO: is this correct? 
+ src->add("level("); + _emitTEXSampleCoordInputComponent(shaderContext, texInstruction, 3, LATTE_DECOMPILER_DTYPE_FLOAT); + src->add(")"); + } + } + else if (texOpcode == GPU7_TEX_INST_SAMPLE_LZ || texOpcode == GPU7_TEX_INST_SAMPLE_C_LZ) + { + src->add(", level(0.0)"); + } + } + } + // gradient parameters + if (texOpcode == GPU7_TEX_INST_SAMPLE_G) + { + if (texDim == Latte::E_DIM::DIM_2D || + texDim == Latte::E_DIM::DIM_1D) + { + src->add(", gradient2d(gradH.xy, gradV.xy)"); + } + else + { + cemu_assert_unimplemented(); + } + } + + // offset + if( texOpcode == GPU7_TEX_INST_SAMPLE_L || texOpcode == GPU7_TEX_INST_SAMPLE_LZ || texOpcode == GPU7_TEX_INST_SAMPLE_C_LZ || texOpcode == GPU7_TEX_INST_SAMPLE || texOpcode == GPU7_TEX_INST_SAMPLE_C ) + { + if( hasOffset ) + { + uint8 offsetComponentCount = 0; + if( texDim == Latte::E_DIM::DIM_1D ) + offsetComponentCount = 1; + else if( texDim == Latte::E_DIM::DIM_2D ) + offsetComponentCount = 2; + else if( texDim == Latte::E_DIM::DIM_3D ) + offsetComponentCount = 3; + else if( texDim == Latte::E_DIM::DIM_2D_ARRAY ) + offsetComponentCount = 2; + else + cemu_assert_unimplemented(); + + if( (texInstruction->textureFetch.offsetX&1) ) + cemu_assert_unimplemented(); + if( (texInstruction->textureFetch.offsetY&1) ) + cemu_assert_unimplemented(); + if ((texInstruction->textureFetch.offsetZ & 1)) + cemu_assert_unimplemented(); + + if( offsetComponentCount == 1 ) + src->addFmt(",{}", texInstruction->textureFetch.offsetX/2); + else if( offsetComponentCount == 2 ) + src->addFmt(",int2({},{})", texInstruction->textureFetch.offsetX/2, texInstruction->textureFetch.offsetY/2, texInstruction->textureFetch.offsetZ/2); + else if( offsetComponentCount == 3 ) + src->addFmt(",int3({},{},{})", texInstruction->textureFetch.offsetX/2, texInstruction->textureFetch.offsetY/2, texInstruction->textureFetch.offsetZ/2); + } + } + + // lod bias (TODO: wht?) 
+ + src->add(")"); + } + + if (isCompare) + src->add(")"); + + if (texOpcode == GPU7_TEX_INST_SAMPLE_C || texOpcode == GPU7_TEX_INST_SAMPLE_C_LZ) + { + src->add("."); + + if (numWrittenElements > 1) + { + // result is copied into multiple channels + for (sint32 f = 0; f < numWrittenElements; f++) + { + cemu_assert_debug(texInstruction->dstSel[f] == 0); // only x component is defined + src->add("x"); + } + } + else + { + src->add("x"); + } + } + else + { + src->add("."); + for (sint32 f = 0; f < 4; f++) + { + if (texInstruction->dstSel[f] < 4) + { + uint8 elemIndex = texInstruction->dstSel[f]; + if (isGather) + { + // 's textureGather() and GPU7's FETCH4 instruction have a different order of elements + // xyzw: top-left, top-right, bottom-right, bottom-left + // textureGather xyzw + // fetch4 yzxw + // translate index from fetch4 to textureGather order + static uint8 fetchToGather[4] = + { + 2, // x -> z + 0, // y -> x + 1, // z -> y + 3, // w -> w + }; + elemIndex = fetchToGather[elemIndex]; + } + src->add(resultElemTable[elemIndex]); + } + else if (texInstruction->dstSel[f] == 7) + { + // masked and not written + } + else + { + cemu_assert_unimplemented(); + } + } + } + src->add(");"); + + // debug +#ifdef CEMU_DEBUG_ASSERT + if(texInstruction->opcode == GPU7_TEX_INST_LD ) + src->add(" // TEX_INST_LD"); + else if(texInstruction->opcode == GPU7_TEX_INST_SAMPLE ) + src->add(" // TEX_INST_SAMPLE"); + else if(texInstruction->opcode == GPU7_TEX_INST_SAMPLE_L ) + src->add(" // TEX_INST_SAMPLE_L"); + else if(texInstruction->opcode == GPU7_TEX_INST_SAMPLE_LZ ) + src->add(" // TEX_INST_SAMPLE_LZ"); + else if(texInstruction->opcode == GPU7_TEX_INST_SAMPLE_C ) + src->add(" // TEX_INST_SAMPLE_C"); + else if(texInstruction->opcode == GPU7_TEX_INST_SAMPLE_G ) + src->add(" // TEX_INST_SAMPLE_G"); + else + src->addFmt(" // 0x{:02x}", texInstruction->opcode); + if (texInstruction->opcode != texOpcode) + src->addFmt(" (applied as 0x{:02x})", texOpcode); + src->addFmt(" OffsetXYZ 
{:02x} {:02x} {:02x}", (uint8)texInstruction->textureFetch.offsetX&0xFF, (uint8)texInstruction->textureFetch.offsetY&0xFF, (uint8)texInstruction->textureFetch.offsetZ&0xFF); +#endif + src->add("" _CRLF); +} + +static void _emitTEXGetTextureResInfoCode(LatteDecompilerShaderContext* shaderContext, LatteDecompilerTEXInstruction* texInstruction) +{ + StringBuf* src = shaderContext->shaderSource; + src->addFmt("R{}", texInstruction->dstGpr); + src->add("i"); + src->add("."); + + const char* resultElemTable[4] = {"x","y","z","w"}; + sint32 numWrittenElements = 0; + for(sint32 f=0; f<4; f++) + { + if( texInstruction->dstSel[f] < 4 ) + { + src->add(resultElemTable[f]); + numWrittenElements++; + } + else if( texInstruction->dstSel[f] == 7 ) + { + // masked and not written + } + else + { + cemu_assert_unimplemented(); + } + } + + // todo - mip index parameter? + + if (static_cast(g_renderer.get())->SupportsFramebufferFetch() && shaderContext->shader->textureRenderTargetIndex[texInstruction->textureFetch.textureIndex] != 255) + { + // TODO: use the render target size + src->addFmt(" = int4(1920, 1080, 1, 1)."); + } + else + { + auto texDim = shaderContext->shader->textureUnitDim[texInstruction->textureFetch.textureIndex]; + + if (texDim == Latte::E_DIM::DIM_1D) + src->addFmt(" = int4(tex{}.get_width(), 1, 1, 1).", texInstruction->textureFetch.textureIndex); + else if (texDim == Latte::E_DIM::DIM_1D_ARRAY) + src->addFmt(" = int4(tex{}.get_width(), tex{}.get_array_size(), 1, 1).", texInstruction->textureFetch.textureIndex, texInstruction->textureFetch.textureIndex); + else if (texDim == Latte::E_DIM::DIM_2D || texDim == Latte::E_DIM::DIM_2D_MSAA) + src->addFmt(" = int4(tex{}.get_width(), tex{}.get_height(), 1, 1).", texInstruction->textureFetch.textureIndex, texInstruction->textureFetch.textureIndex); + else if (texDim == Latte::E_DIM::DIM_2D_ARRAY) + src->addFmt(" = int4(tex{}.get_width(), tex{}.get_height(), tex{}.get_array_size(), 1).", 
texInstruction->textureFetch.textureIndex, texInstruction->textureFetch.textureIndex, texInstruction->textureFetch.textureIndex); + else + { + cemu_assert_debug(false); + src->addFmt(" = int4(tex{}.get_width(), tex{}.get_height(), 1, 1).", texInstruction->textureFetch.textureIndex, texInstruction->textureFetch.textureIndex); + } + } + + for(sint32 f=0; f<4; f++) + { + if( texInstruction->dstSel[f] < 4 ) + { + src->add(resultElemTable[texInstruction->dstSel[f]]); + numWrittenElements++; + } + else if( texInstruction->dstSel[f] == 7 ) + { + // masked and not written + } + else + { + debugBreakpoint(); + } + } + src->add(";" _CRLF); +} + +static void _emitTEXGetCompTexLodCode(LatteDecompilerShaderContext* shaderContext, LatteDecompilerTEXInstruction* texInstruction) +{ + StringBuf* src = shaderContext->shaderSource; + src->add(_getRegisterVarName(shaderContext, texInstruction->dstGpr)); + src->add("."); + + const char* resultElemTable[4] = {"x","y","z","w"}; + sint32 numWrittenElements = 0; + for(sint32 f=0; f<4; f++) + { + if( texInstruction->dstSel[f] < 4 ) + { + src->add(resultElemTable[f]); + numWrittenElements++; + } + else if( texInstruction->dstSel[f] == 7 ) + { + // masked and not written + } + else + { + debugBreakpoint(); + } + } + + src->add(" = "); + _emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, shaderContext->typeTracker.defaultDataType); + + if (static_cast(g_renderer.get())->SupportsFramebufferFetch() && shaderContext->shader->textureRenderTargetIndex[texInstruction->textureFetch.textureIndex] != 255) + { + // We assume that textures accessed as framebuffer fetch are always sampled at pixel coordinates, therefore the lod would always be 0.0 + src->add("float4(0.0, 0.0, 0.0, 0.0)"); + } + else + { + if (shaderContext->shader->textureUnitDim[texInstruction->textureFetch.textureIndex] == Latte::E_DIM::DIM_CUBEMAP) + { + // 3 coordinates + if(shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_FLOAT) + 
src->addFmt("float4(textureCalculateLod(tex{}, samplr{}, {}.{}{}{}), 0.0, 0.0)", texInstruction->textureFetch.textureIndex, texInstruction->textureFetch.textureIndex, _getRegisterVarName(shaderContext, texInstruction->srcGpr), resultElemTable[texInstruction->textureFetch.srcSel[0]], resultElemTable[texInstruction->textureFetch.srcSel[1]], resultElemTable[texInstruction->textureFetch.srcSel[2]]); + else + src->addFmt("float4(textureCalculateLod(tex{}, samplr{}, as_type({}.{}{}{})), 0.0, 0.0)", texInstruction->textureFetch.textureIndex, texInstruction->textureFetch.textureIndex, _getRegisterVarName(shaderContext, texInstruction->srcGpr), resultElemTable[texInstruction->textureFetch.srcSel[0]], resultElemTable[texInstruction->textureFetch.srcSel[1]], resultElemTable[texInstruction->textureFetch.srcSel[2]]); + } + else + { + if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_FLOAT) + src->addFmt("float4(textureCalculateLod(tex{}, samplr{}, {}.{}{}), 0.0, 0.0)", texInstruction->textureFetch.textureIndex, texInstruction->textureFetch.textureIndex, _getRegisterVarName(shaderContext, texInstruction->srcGpr), resultElemTable[texInstruction->textureFetch.srcSel[0]], resultElemTable[texInstruction->textureFetch.srcSel[1]]); + else + src->addFmt("float4(textureCalculateLod(tex{}, samplr{}, as_type({}.{}{})), 0.0, 0.0)", texInstruction->textureFetch.textureIndex, texInstruction->textureFetch.textureIndex, _getRegisterVarName(shaderContext, texInstruction->srcGpr), resultElemTable[texInstruction->textureFetch.srcSel[0]], resultElemTable[texInstruction->textureFetch.srcSel[1]]); + debugBreakpoint(); + } + } + + _emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, shaderContext->typeTracker.defaultDataType); + src->add("."); + + for(sint32 f=0; f<4; f++) + { + if( texInstruction->dstSel[f] < 4 ) + { + src->add(resultElemTable[texInstruction->dstSel[f]]); + numWrittenElements++; + } + else if( texInstruction->dstSel[f] == 7 ) + { + // 
masked and not written + } + else + { + debugBreakpoint(); + } + } + src->add(";" _CRLF); +} + +static void _emitTEXSetCubemapIndexCode(LatteDecompilerShaderContext* shaderContext, LatteDecompilerTEXInstruction* texInstruction) +{ + StringBuf* src = shaderContext->shaderSource; + src->addFmt("cubeMapArrayIndex{}", texInstruction->textureFetch.textureIndex); + const char* resultElemTable[4] = {"x","y","z","w"}; + + if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_SIGNED_INT) + src->addFmt(" = as_type(R{}i.{});" _CRLF, texInstruction->srcGpr, resultElemTable[texInstruction->textureFetch.srcSel[0]]); + else if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_FLOAT) + src->addFmt(" = R{}f.{};" _CRLF, texInstruction->srcGpr, resultElemTable[texInstruction->textureFetch.srcSel[0]]); + else + cemu_assert_unimplemented(); +} + +static void _emitTEXGetGradientsHV(LatteDecompilerShaderContext* shaderContext, LatteDecompilerTEXInstruction* texInstruction) +{ + StringBuf* src = shaderContext->shaderSource; + sint32 componentCount = 0; + for (sint32 i = 0; i < 4; i++) + { + if (texInstruction->dstSel[i] == 7) + continue; + componentCount++; + } + src->add(_getRegisterVarName(shaderContext, texInstruction->dstGpr)); + src->add("."); + const char* resultElemTable[4] = { "x","y","z","w" }; + sint32 numWrittenElements = 0; + for (sint32 f = 0; f < 4; f++) + { + if (texInstruction->dstSel[f] < 4) + { + src->add(resultElemTable[f]); + numWrittenElements++; + } + else if (texInstruction->dstSel[f] == 7) + { + // masked and not written + } + else + { + debugBreakpoint(); + } + } + + const char* funcName; + if (texInstruction->opcode == GPU7_TEX_INST_GET_GRADIENTS_H) + funcName = "dfdx"; + else + funcName = "dfdy"; + + src->add(" = "); + + _emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, shaderContext->typeTracker.defaultDataType, componentCount); + + src->addFmt("{}(", funcName); + 
_emitRegisterAccessCode(shaderContext, texInstruction->srcGpr, (componentCount >= 1) ? texInstruction->textureFetch.srcSel[0] : -1, (componentCount >= 2) ? texInstruction->textureFetch.srcSel[1] : -1, (componentCount >= 3) ? texInstruction->textureFetch.srcSel[2] : -1, (componentCount >= 4) ? texInstruction->textureFetch.srcSel[3] : -1, LATTE_DECOMPILER_DTYPE_FLOAT); + + src->add(")"); + + _emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_FLOAT, shaderContext->typeTracker.defaultDataType); + + src->add(";" _CRLF); + +} + +static void _emitTEXSetGradientsHV(LatteDecompilerShaderContext* shaderContext, LatteDecompilerTEXInstruction* texInstruction) +{ + StringBuf* src = shaderContext->shaderSource; + if (texInstruction->opcode == GPU7_TEX_INST_SET_GRADIENTS_H) + src->add("gradH = "); + else + src->add("gradV = "); + + _emitRegisterAccessCode(shaderContext, texInstruction->srcGpr, texInstruction->textureFetch.srcSel[0], texInstruction->textureFetch.srcSel[1], texInstruction->textureFetch.srcSel[2], texInstruction->textureFetch.srcSel[3], LATTE_DECOMPILER_DTYPE_FLOAT); + + src->add(";" _CRLF); +} + +static void _emitGSReadInputVFetchCode(LatteDecompilerShaderContext* shaderContext, LatteDecompilerTEXInstruction* texInstruction) +{ + StringBuf* src = shaderContext->shaderSource; + src->add(_getRegisterVarName(shaderContext, texInstruction->dstGpr)); + + src->add("."); + + const char* resultElemTable[4] = {"x","y","z","w"}; + sint32 numWrittenElements = 0; + for(sint32 f=0; f<4; f++) + { + if( texInstruction->dstSel[f] < 4 ) + { + src->add(resultElemTable[f]); + numWrittenElements++; + } + else if( texInstruction->dstSel[f] == 7 ) + { + // masked and not written + } + else + { + cemu_assert_unimplemented(); + } + } + + src->add(" = "); + _emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_SIGNED_INT, shaderContext->typeTracker.defaultDataType, numWrittenElements); + src->add("(objectPayload.vertexOut["); + if 
(texInstruction->textureFetch.srcSel[0] >= 4) + cemu_assert_unimplemented(); + if (texInstruction->textureFetch.srcSel[1] >= 4) + cemu_assert_unimplemented(); + src->add("vertexIndex"); + src->addFmt("].passParameterSem{}.", texInstruction->textureFetch.offset/16); + + + for(sint32 f=0; f<4; f++) + { + if( texInstruction->dstSel[f] < 4 ) + { + src->add(resultElemTable[texInstruction->dstSel[f]]); + } + else if( texInstruction->dstSel[f] == 7 ) + { + // masked and not written + } + else + { + cemu_assert_unimplemented(); + } + } + src->add(")"); + _emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_SIGNED_INT, shaderContext->typeTracker.defaultDataType); + src->add(";" _CRLF); +} + +static sint32 _writeDestMaskXYZW(LatteDecompilerShaderContext* shaderContext, sint8* dstSel) +{ + StringBuf* src = shaderContext->shaderSource; + const char* resultElemTable[4] = { "x","y","z","w" }; + sint32 numWrittenElements = 0; + for (sint32 f = 0; f < 4; f++) + { + if (dstSel[f] < 4) + { + src->add(resultElemTable[f]); + numWrittenElements++; + } + else if (dstSel[f] == 7) + { + // masked and not written + } + else + { + cemu_assert_unimplemented(); + } + } + return numWrittenElements; +} + +static void _emitTEXVFetchCode(LatteDecompilerShaderContext* shaderContext, LatteDecompilerTEXInstruction* texInstruction) +{ + // handle special case where geometry shader reads input attributes from vertex shader via ringbuffer + StringBuf* src = shaderContext->shaderSource; + if( texInstruction->textureFetch.textureIndex == 0x9F && shaderContext->shaderType == LatteConst::ShaderType::Geometry ) + { + _emitGSReadInputVFetchCode(shaderContext, texInstruction); + return; + } + + src->add(_getRegisterVarName(shaderContext, texInstruction->dstGpr)); + src->add("."); + + _writeDestMaskXYZW(shaderContext, texInstruction->dstSel); + const char* resultElemTable[4] = {"x","y","z","w"}; + uint32 numWrittenElements = 0; + for (sint32 f=0; f<4; f++) + { + if (texInstruction->dstSel[f] < 4) 
+ numWrittenElements++; + } + + src->add(" = "); + + if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_SIGNED_INT) + { + if (numWrittenElements == 1) + src->add("as_type("); + else + src->addFmt("as_type(", numWrittenElements); + } + else + src->add("("); + + src->addFmt("ubuff{}.d[", texInstruction->textureFetch.textureIndex - 0x80); + + if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_SIGNED_INT) + src->addFmt("{}.{}", _getRegisterVarName(shaderContext, texInstruction->srcGpr), resultElemTable[texInstruction->textureFetch.srcSel[0]]); + else + src->addFmt("as_type({}.{})", _getRegisterVarName(shaderContext, texInstruction->srcGpr), resultElemTable[texInstruction->textureFetch.srcSel[0]]); + src->add("]."); + + + for (sint32 f=0; f<4; f++) + { + if (texInstruction->dstSel[f] < 4) + { + src->add(resultElemTable[texInstruction->dstSel[f]]); + } + else if (texInstruction->dstSel[f] == 7) + { + // masked and not written + } + else + { + debugBreakpoint(); + } + } + src->add(");" _CRLF); +} + +static void _emitTEXReadMemCode(LatteDecompilerShaderContext* shaderContext, LatteDecompilerTEXInstruction* texInstruction) +{ + StringBuf* src = shaderContext->shaderSource; + src->add(_getRegisterVarName(shaderContext, texInstruction->dstGpr)); + src->add("."); + sint32 count = _writeDestMaskXYZW(shaderContext, texInstruction->dstSel); + + src->add(" = "); + + if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_SIGNED_INT) + { + if (count == 1) + src->add("as_type("); + else + src->addFmt("as_type(", count); + } + else + src->add("("); + + sint32 readCount; + + if (texInstruction->memRead.format == FMT_32_FLOAT) + { + readCount = 1; + // todo + src->add("0.0"); + } + else if (texInstruction->memRead.format == FMT_32_32_FLOAT) + { + readCount = 2; + // todo + src->add("float2(0.0,0.0)"); + } + else if (texInstruction->memRead.format == FMT_32_32_32_FLOAT) + { + readCount = 3; + // todo + 
src->add("float3(0.0,0.0,0.0)"); + } + else + { + cemu_assert_unimplemented(); + } + + if (count < readCount) + { + if (count == 1) + src->add(".x"); + else if (count == 2) + src->add(".xy"); + else if (count == 3) + src->add(".xyz"); + } + src->add(");" _CRLF); +} + +static void _emitTEXClauseCode(LatteDecompilerShaderContext* shaderContext, LatteDecompilerCFInstruction* cfInstruction) +{ + cemu_assert_debug(cfInstruction->instructionsALU.empty()); + for(auto& texInstruction : cfInstruction->instructionsTEX) + { + if( texInstruction.opcode == GPU7_TEX_INST_SAMPLE || texInstruction.opcode == GPU7_TEX_INST_SAMPLE_L || texInstruction.opcode == GPU7_TEX_INST_SAMPLE_LB || texInstruction.opcode == GPU7_TEX_INST_SAMPLE_LZ || texInstruction.opcode == GPU7_TEX_INST_SAMPLE_C || texInstruction.opcode == GPU7_TEX_INST_SAMPLE_C_L || texInstruction.opcode == GPU7_TEX_INST_SAMPLE_C_LZ || texInstruction.opcode == GPU7_TEX_INST_FETCH4 || texInstruction.opcode == GPU7_TEX_INST_SAMPLE_G || texInstruction.opcode == GPU7_TEX_INST_LD ) + _emitTEXSampleTextureCode(shaderContext, &texInstruction); + else if( texInstruction.opcode == GPU7_TEX_INST_GET_TEXTURE_RESINFO ) + _emitTEXGetTextureResInfoCode(shaderContext, &texInstruction); + else if( texInstruction.opcode == GPU7_TEX_INST_GET_COMP_TEX_LOD ) + _emitTEXGetCompTexLodCode(shaderContext, &texInstruction); + else if( texInstruction.opcode == GPU7_TEX_INST_SET_CUBEMAP_INDEX ) + _emitTEXSetCubemapIndexCode(shaderContext, &texInstruction); + else if (texInstruction.opcode == GPU7_TEX_INST_GET_GRADIENTS_H || + texInstruction.opcode == GPU7_TEX_INST_GET_GRADIENTS_V) + _emitTEXGetGradientsHV(shaderContext, &texInstruction); + else if (texInstruction.opcode == GPU7_TEX_INST_SET_GRADIENTS_H || + texInstruction.opcode == GPU7_TEX_INST_SET_GRADIENTS_V) + _emitTEXSetGradientsHV(shaderContext, &texInstruction); + else if (texInstruction.opcode == GPU7_TEX_INST_VFETCH) + _emitTEXVFetchCode(shaderContext, &texInstruction); + else if 
(texInstruction.opcode == GPU7_TEX_INST_MEM) + _emitTEXReadMemCode(shaderContext, &texInstruction); + else + cemu_assert_unimplemented(); + } +} + +// generate the code for reading the source input GPR (or constants) for exports +static void _emitExportGPRReadCode(LatteDecompilerShaderContext* shaderContext, LatteDecompilerCFInstruction* cfInstruction, sint32 requiredType, uint32 burstIndex) +{ + StringBuf* src = shaderContext->shaderSource; + uint32 numOutputs = 4; + if( cfInstruction->type == GPU7_CF_INST_MEM_RING_WRITE ) + { + numOutputs = (cfInstruction->memWriteCompMask&1)?1:0; + numOutputs += (cfInstruction->memWriteCompMask&2)?1:0; + numOutputs += (cfInstruction->memWriteCompMask&4)?1:0; + numOutputs += (cfInstruction->memWriteCompMask&8)?1:0; + } + if (requiredType == LATTE_DECOMPILER_DTYPE_FLOAT) + { + if(numOutputs == 1) + src->add("float("); + else + src->addFmt("float{}(", numOutputs); + } + else if (requiredType == LATTE_DECOMPILER_DTYPE_SIGNED_INT) + { + if (numOutputs == 1) + src->add("int("); + else + src->addFmt("int{}(", numOutputs); + } + else + cemu_assert_unimplemented(); + sint32 actualOutputs = 0; + for(sint32 i=0; i<4; i++) + { + // todo: Use type of register element based on information from type tracker (currently we assume it's always a signed integer) + uint32 exportSel = 0; + if( cfInstruction->type == GPU7_CF_INST_MEM_RING_WRITE ) + { + exportSel = i; + if( (cfInstruction->memWriteCompMask&(1<exportComponentSel[i]; + } + if( actualOutputs > 0 ) + src->add(", "); + actualOutputs++; + if( exportSel < 4 ) + { + _emitRegisterAccessCode(shaderContext, cfInstruction->exportSourceGPR+burstIndex, exportSel, -1, -1, -1, requiredType); + } + else if (exportSel == 4) + { + // constant zero + src->add("0"); + } + else if (exportSel == 5) + { + // constant one + src->add("1.0"); + } + else if( exportSel == 7 ) + { + // element masked (which means 0 is exported?) 
+ src->add("0"); + } + else + { + cemu_assert_debug(false); + src->add("0"); + } + } + if( requiredType == LATTE_DECOMPILER_DTYPE_FLOAT ) + src->add(")"); + else if( requiredType == LATTE_DECOMPILER_DTYPE_SIGNED_INT ) + src->add(")"); + else + cemu_assert_unimplemented(); +} + +static void _emitExportCode(LatteDecompilerShaderContext* shaderContext, LatteDecompilerCFInstruction* cfInstruction) +{ + if (!rasterizationEnabled) + return; + + StringBuf* src = shaderContext->shaderSource; + src->add("// export" _CRLF); + if(shaderContext->shaderType == LatteConst::ShaderType::Vertex ) + { + if( cfInstruction->exportBurstCount != 0 ) + debugBreakpoint(); + if (cfInstruction->exportType == 1 && cfInstruction->exportArrayBase == GPU7_DECOMPILER_CF_EXPORT_BASE_POSITION) + { + // export position + // GX2 special state 0 disables rasterizer viewport offset and scaling (probably, exact mechanism is not known). Handle this here + bool hasAnyViewportScaleDisabled = + !shaderContext->contextRegistersNew->PA_CL_VTE_CNTL.get_VPORT_X_SCALE_ENA() || + !shaderContext->contextRegistersNew->PA_CL_VTE_CNTL.get_VPORT_Y_SCALE_ENA() || + !shaderContext->contextRegistersNew->PA_CL_VTE_CNTL.get_VPORT_Z_SCALE_ENA(); + + if (hasAnyViewportScaleDisabled) + { + src->add("float4 finalPos = "); + _emitExportGPRReadCode(shaderContext, cfInstruction, LATTE_DECOMPILER_DTYPE_FLOAT, 0); + src->add(";" _CRLF); + src->add("finalPos.xy = finalPos.xy * supportBuffer.windowSpaceToClipSpaceTransform - float2(1.0,1.0);" _CRLF); + src->add("SET_POSITION(finalPos);"); + } + else + { + src->add("SET_POSITION("); + _emitExportGPRReadCode(shaderContext, cfInstruction, LATTE_DECOMPILER_DTYPE_FLOAT, 0); + src->add(");" _CRLF); + } + } + else if (cfInstruction->exportType == 1 && cfInstruction->exportArrayBase == GPU7_DECOMPILER_CF_EXPORT_POINT_SIZE ) + { + // export gl_PointSize + if (shaderContext->analyzer.outputPointSize) + { + cemu_assert_debug(shaderContext->analyzer.writesPointSize); + src->add("out.pointSize = 
("); + _emitExportGPRReadCode(shaderContext, cfInstruction, LATTE_DECOMPILER_DTYPE_FLOAT, 0); + src->add(").x"); + src->add(";" _CRLF); + } + } + else if( cfInstruction->exportType == 2 && cfInstruction->exportArrayBase < 32 ) + { + // export parameter + sint32 paramIndex = cfInstruction->exportArrayBase; + uint32 vsSemanticId = _getVertexShaderOutParamSemanticId(shaderContext->contextRegisters, paramIndex); + if (vsSemanticId != 0xFF) + { + src->addFmt("out.passParameterSem{} = ", vsSemanticId); + _emitExportGPRReadCode(shaderContext, cfInstruction, LATTE_DECOMPILER_DTYPE_FLOAT, 0); + src->add(";" _CRLF); + } + else + { + src->add("// skipped export to semanticId 255" _CRLF); + } + } + else + cemu_assert_unimplemented(); + } + else if(shaderContext->shaderType == LatteConst::ShaderType::Pixel ) + { + if( cfInstruction->exportType == 0 && cfInstruction->exportArrayBase < 8 ) + { + for(uint32 i=0; i<(cfInstruction->exportBurstCount+1); i++) + { + sint32 pixelColorOutputIndex = LatteDecompiler_getColorOutputIndexFromExportIndex(shaderContext, cfInstruction->exportArrayBase+i); + // if color output is for target 0, then also handle alpha test + bool alphaTestEnable = shaderContext->contextRegistersNew->SX_ALPHA_TEST_CONTROL.get_ALPHA_TEST_ENABLE(); + auto alphaTestFunc = shaderContext->contextRegistersNew->SX_ALPHA_TEST_CONTROL.get_ALPHA_FUNC(); + if( pixelColorOutputIndex == 0 && alphaTestEnable && alphaTestFunc == Latte::E_COMPAREFUNC::NEVER ) + { + // never pass alpha test + src->add("discard_fragment();" _CRLF); + } + else if( pixelColorOutputIndex == 0 && alphaTestEnable && alphaTestFunc != Latte::E_COMPAREFUNC::ALWAYS) + { + src->add("if( (("); + _emitExportGPRReadCode(shaderContext, cfInstruction, LATTE_DECOMPILER_DTYPE_FLOAT, i); + src->add(").a "); + + switch( alphaTestFunc ) + { + case Latte::E_COMPAREFUNC::LESS: + src->add("<"); + break; + case Latte::E_COMPAREFUNC::EQUAL: + src->add("=="); + break; + case Latte::E_COMPAREFUNC::LEQUAL: + src->add("<="); + 
break; + case Latte::E_COMPAREFUNC::GREATER: + src->add(">"); + break; + case Latte::E_COMPAREFUNC::NOTEQUAL: + src->add("!="); + break; + case Latte::E_COMPAREFUNC::GEQUAL: + src->add(">="); + break; + } + src->add(" supportBuffer.alphaTestRef"); + src->add(") == false) discard_fragment();" _CRLF); + } + // pixel color output + auto dataType = GetColorBufferDataType(pixelColorOutputIndex, *shaderContext->contextRegistersNew); + if (dataType != MetalDataType::NONE) + { + src->addFmt("out.passPixelColor{} = as_type<{}>(", pixelColorOutputIndex, GetDataTypeStr(dataType)); + _emitExportGPRReadCode(shaderContext, cfInstruction, LATTE_DECOMPILER_DTYPE_FLOAT, i); + src->add(");" _CRLF); + } + + if( cfInstruction->exportArrayBase+i >= 8 ) + cemu_assert_unimplemented(); + } + } + else if( cfInstruction->exportType == 0 && cfInstruction->exportArrayBase == 61 ) + { + // pixel depth or gl_FragStencilRefARB + if( cfInstruction->exportBurstCount > 0 ) + cemu_assert_unimplemented(); + + if (cfInstruction->exportComponentSel[0] == 7) + { + cemu_assert_unimplemented(); // gl_FragDepth ? 
+ } + if (cfInstruction->exportComponentSel[1] != 7) + { + cemu_assert_unimplemented(); // exporting to gl_FragStencilRefARB + } + if (cfInstruction->exportComponentSel[2] != 7) + { + cemu_assert_unimplemented(); // ukn + } + if (cfInstruction->exportComponentSel[3] != 7) + { + cemu_assert_unimplemented(); // ukn + } + + if (!shaderContext->shader->depthMask) + return; + + src->add("out.passDepth = "); + _emitExportGPRReadCode(shaderContext, cfInstruction, LATTE_DECOMPILER_DTYPE_FLOAT, 0); + src->add(".x"); + src->add(";" _CRLF); + } + else + cemu_assert_unimplemented(); + } +} + +static void _emitXYZWByMask(StringBuf* src, uint32 mask) +{ + if( (mask&(1<<0)) != 0 ) + src->add("x"); + if( (mask&(1<<1)) != 0 ) + src->add("y"); + if( (mask&(1<<2)) != 0 ) + src->add("z"); + if( (mask&(1<<3)) != 0 ) + src->add("w"); +} + +static void _emitCFRingWriteCode(LatteDecompilerShaderContext* shaderContext, LatteDecompilerCFInstruction* cfInstruction) +{ + StringBuf* src = shaderContext->shaderSource; + // calculate parameter output (based on ring buffer output offset relative to GS unit) + uint32 bytesPerVertex = shaderContext->contextRegisters[mmSQ_GS_VERT_ITEMSIZE] * 4; + bytesPerVertex = std::max(bytesPerVertex, (uint32)1); // avoid division by zero + uint32 parameterOffset = ((cfInstruction->exportArrayBase * 4) % bytesPerVertex); + // for geometry shaders with streamout, MEM_RING_WRITE is used to pass the data to the copy shader, which then uses STREAM*_WRITE + if (shaderContext->shaderType == LatteConst::ShaderType::Geometry && shaderContext->analyzer.hasStreamoutEnable) + { + // if streamout is enabled, we generate transform feedback output code instead of the normal gs output + for (uint32 burstIndex = 0; burstIndex < (cfInstruction->exportBurstCount + 1); burstIndex++) + { + parameterOffset = ((cfInstruction->exportArrayBase * 4 + burstIndex*0x10) % bytesPerVertex); + // find matching stream write in copy shader + LatteGSCopyShaderStreamWrite_t* streamWrite = nullptr; 
+ for (auto& it : shaderContext->parsedGSCopyShader->list_streamWrites) + { + if (it.offset == parameterOffset) + { + streamWrite = ⁢ + break; + } + } + if (streamWrite == nullptr) + { + cemu_assert_suspicious(); + return; + } + + for (sint32 i = 0; i < 4; i++) + { + if ((cfInstruction->memWriteCompMask&(1 << i)) == 0) + continue; + + uint32 u32Offset = streamWrite->exportArrayBase + i; + src->addFmt("sb[sbBase{} + {}]", streamWrite->bufferIndex, u32Offset); + + src->add(" = "); + + _emitTypeConversionPrefixMSL(shaderContext, shaderContext->typeTracker.defaultDataType, LATTE_DECOMPILER_DTYPE_SIGNED_INT); + + src->addFmt("{}.", _getRegisterVarName(shaderContext, cfInstruction->exportSourceGPR+burstIndex)); + if (i == 0) + src->add("x"); + else if (i == 1) + src->add("y"); + else if (i == 2) + src->add("z"); + else if (i == 3) + src->add("w"); + + _emitTypeConversionSuffixMSL(shaderContext, shaderContext->typeTracker.defaultDataType, LATTE_DECOMPILER_DTYPE_SIGNED_INT); + + src->add(";" _CRLF); + } + } + return; + } + + if (!rasterizationEnabled) + return; + + if (shaderContext->shaderType == LatteConst::ShaderType::Vertex) + { + if (cfInstruction->memWriteElemSize != 3) + cemu_assert_unimplemented(); + if ((cfInstruction->exportArrayBase & 3) != 0) + cemu_assert_unimplemented(); + for (sint32 burstIndex = 0; burstIndex < (sint32)(cfInstruction->exportBurstCount + 1); burstIndex++) + { + src->addFmt("out.passParameterSem{}.", (cfInstruction->exportArrayBase) / 4 + burstIndex); + _emitXYZWByMask(src, cfInstruction->memWriteCompMask); + src->addFmt(" = "); + _emitExportGPRReadCode(shaderContext, cfInstruction, LATTE_DECOMPILER_DTYPE_SIGNED_INT, burstIndex); + src->add(";" _CRLF); + } + } + else if (shaderContext->shaderType == LatteConst::ShaderType::Geometry) + { + cemu_assert_debug(cfInstruction->memWriteElemSize == 3); + //if (cfInstruction->memWriteElemSize != 3) + // debugBreakpoint(); + cemu_assert_debug((cfInstruction->exportArrayBase & 3) == 0); + + for (uint32 
burstIndex = 0; burstIndex < (cfInstruction->exportBurstCount + 1); burstIndex++) + { + uint32 parameterExportType = 0; + uint32 parameterExportBase = 0; + if (LatteGSCopyShaderParser_getExportTypeByOffset(shaderContext->parsedGSCopyShader, parameterOffset + burstIndex * (cfInstruction->memWriteElemSize+1)*4, ¶meterExportType, ¶meterExportBase) == false) + { + cemu_assert_debug(false); + shaderContext->hasError = true; + return; + } + + if (parameterExportType == 1 && parameterExportBase == GPU7_DECOMPILER_CF_EXPORT_BASE_POSITION) + { + src->add("{" _CRLF); + src->addFmt("float4 pos = float4(0.0,0.0,0.0,1.0);" _CRLF); + src->addFmt("pos."); + _emitXYZWByMask(src, cfInstruction->memWriteCompMask); + src->addFmt(" = "); + _emitExportGPRReadCode(shaderContext, cfInstruction, LATTE_DECOMPILER_DTYPE_FLOAT, burstIndex); + src->add(";" _CRLF); + src->add("SET_POSITION(pos);" _CRLF); + src->add("}" _CRLF); + } + else if (parameterExportType == 2 && parameterExportBase < 16) + { + src->addFmt("out.passParameterSem{}.", parameterExportBase); + _emitXYZWByMask(src, cfInstruction->memWriteCompMask); + src->addFmt(" = "); + _emitExportGPRReadCode(shaderContext, cfInstruction, LATTE_DECOMPILER_DTYPE_FLOAT, burstIndex); + src->add(";" _CRLF); + } + else + cemu_assert_debug(false); + } + } + else + debugBreakpoint(); // todo +} + +static void _emitStreamWriteCode(LatteDecompilerShaderContext* shaderContext, LatteDecompilerCFInstruction* cfInstruction) +{ + StringBuf* src = shaderContext->shaderSource; + if (shaderContext->analyzer.hasStreamoutEnable == false) + { +#ifdef CEMU_DEBUG_ASSERT + src->add("// omitted streamout write" _CRLF); +#endif + return; + } + uint32 streamoutBufferIndex; + if (cfInstruction->type == GPU7_CF_INST_MEM_STREAM0_WRITE) + streamoutBufferIndex = 0; + else if (cfInstruction->type == GPU7_CF_INST_MEM_STREAM1_WRITE) + streamoutBufferIndex = 1; + else + cemu_assert_unimplemented(); + + if (shaderContext->shaderType == LatteConst::ShaderType::Vertex) + { + 
uint32 arraySize = cfInstruction->memWriteArraySize + 1; + + for (sint32 i = 0; i < (sint32)arraySize; i++) + { + if ((cfInstruction->memWriteCompMask&(1 << i)) == 0) + continue; + + uint32 u32Offset = cfInstruction->exportArrayBase + i; + src->addFmt("sb[sbBase{} + {}]", streamoutBufferIndex, u32Offset); + + src->add(" = "); + + _emitTypeConversionPrefixMSL(shaderContext, shaderContext->typeTracker.defaultDataType, LATTE_DECOMPILER_DTYPE_SIGNED_INT); + + src->add(_getRegisterVarName(shaderContext, cfInstruction->exportSourceGPR)); + _appendChannelAccess(src, i); + _emitTypeConversionSuffixMSL(shaderContext, shaderContext->typeTracker.defaultDataType, LATTE_DECOMPILER_DTYPE_SIGNED_INT); + + src->add(";" _CRLF); + } + } + else + cemu_assert_debug(false); +} + +static void _emitCFCall(LatteDecompilerShaderContext* shaderContext, LatteDecompilerCFInstruction* cfInstruction) +{ + StringBuf* src = shaderContext->shaderSource; + uint32 subroutineAddr = cfInstruction->addr; + LatteDecompilerSubroutineInfo* subroutineInfo = nullptr; + // find subroutine + for (auto& subroutineItr : shaderContext->list_subroutines) + { + if (subroutineItr.cfAddr == subroutineAddr) + { + subroutineInfo = &subroutineItr; + break; + } + } + if (subroutineInfo == nullptr) + { + cemu_assert_debug(false); + return; + } + // inline function + if (shaderContext->isSubroutine) + { + cemu_assert_debug(false); // inlining with cascaded function calls not supported + return; + } + // init CF stack variables + src->addFmt("activeMaskStackSub{:04x}[0] = true;" _CRLF, subroutineInfo->cfAddr); + src->addFmt("activeMaskStackCSub{:04x}[0] = true;" _CRLF, subroutineInfo->cfAddr); + src->addFmt("activeMaskStackCSub{:04x}[1] = true;" _CRLF, subroutineInfo->cfAddr); + + shaderContext->isSubroutine = true; + shaderContext->subroutineInfo = subroutineInfo; + for(auto& cfInstruction : subroutineInfo->instructions) + LatteDecompiler_emitClauseCodeMSL(shaderContext, &cfInstruction, true); + 
shaderContext->isSubroutine = false; + shaderContext->subroutineInfo = nullptr; +} + +void LatteDecompiler_emitClauseCodeMSL(LatteDecompilerShaderContext* shaderContext, LatteDecompilerCFInstruction* cfInstruction, bool isSubroutine) +{ + StringBuf* src = shaderContext->shaderSource; + + if( cfInstruction->type == GPU7_CF_INST_ALU || cfInstruction->type == GPU7_CF_INST_ALU_PUSH_BEFORE || cfInstruction->type == GPU7_CF_INST_ALU_POP_AFTER || cfInstruction->type == GPU7_CF_INST_ALU_POP2_AFTER || cfInstruction->type == GPU7_CF_INST_ALU_BREAK || cfInstruction->type == GPU7_CF_INST_ALU_ELSE_AFTER ) + { + // emit ALU code + if (shaderContext->analyzer.modifiesPixelActiveState) + { + if(cfInstruction->type == GPU7_CF_INST_ALU_PUSH_BEFORE) + src->addFmt("if( {} == true ) {{" _CRLF, _getActiveMaskCVarName(shaderContext, cfInstruction->activeStackDepth + 1 - 1)); + else + src->addFmt("if( {} == true ) {{" _CRLF, _getActiveMaskCVarName(shaderContext, cfInstruction->activeStackDepth + 1)); + } + if (cfInstruction->type == GPU7_CF_INST_ALU_PUSH_BEFORE) + { + src->addFmt("{} = {};" _CRLF, _getActiveMaskVarName(shaderContext, cfInstruction->activeStackDepth), _getActiveMaskVarName(shaderContext, cfInstruction->activeStackDepth-1)); + src->addFmt("{} = {};" _CRLF, _getActiveMaskCVarName(shaderContext, cfInstruction->activeStackDepth + 1), _getActiveMaskCVarName(shaderContext, cfInstruction->activeStackDepth)); + } + _emitALUClauseCode(shaderContext, cfInstruction); + if( shaderContext->analyzer.modifiesPixelActiveState ) + src->add("}" _CRLF); + cemu_assert_debug(!(shaderContext->analyzer.modifiesPixelActiveState == false && cfInstruction->type != GPU7_CF_INST_ALU)); + // handle ELSE case of PUSH_BEFORE + if( cfInstruction->type == GPU7_CF_INST_ALU_PUSH_BEFORE ) + { + src->add("else {" _CRLF); + src->addFmt("{} = false;" _CRLF, _getActiveMaskVarName(shaderContext, cfInstruction->activeStackDepth)); + src->addFmt("{} = false;" _CRLF, _getActiveMaskCVarName(shaderContext, 
cfInstruction->activeStackDepth + 1)); + src->add("}" _CRLF); + } + // post clause handler + if( cfInstruction->type == GPU7_CF_INST_ALU_POP_AFTER ) + { + src->addFmt("{} = {} == true && {} == true;" _CRLF, _getActiveMaskCVarName(shaderContext, cfInstruction->activeStackDepth + 1 - 1), _getActiveMaskVarName(shaderContext, cfInstruction->activeStackDepth - 1), _getActiveMaskCVarName(shaderContext, cfInstruction->activeStackDepth - 1)); + } + else if( cfInstruction->type == GPU7_CF_INST_ALU_POP2_AFTER ) + { + src->addFmt("{} = {} == true && {} == true;" _CRLF, _getActiveMaskCVarName(shaderContext, cfInstruction->activeStackDepth + 1 - 2), _getActiveMaskVarName(shaderContext, cfInstruction->activeStackDepth - 2), _getActiveMaskCVarName(shaderContext, cfInstruction->activeStackDepth - 2)); + } + else if( cfInstruction->type == GPU7_CF_INST_ALU_ELSE_AFTER ) + { + // no condition test + // pop stack + if( cfInstruction->popCount != 0 ) + debugBreakpoint(); + // else operation + src->addFmt("{} = {} == false;" _CRLF, _getActiveMaskVarName(shaderContext, cfInstruction->activeStackDepth), _getActiveMaskVarName(shaderContext, cfInstruction->activeStackDepth)); + src->addFmt("{} = {} == true && {} == true;" _CRLF, _getActiveMaskCVarName(shaderContext, cfInstruction->activeStackDepth + 1), _getActiveMaskVarName(shaderContext, cfInstruction->activeStackDepth), _getActiveMaskCVarName(shaderContext, cfInstruction->activeStackDepth)); + } + } + else if( cfInstruction->type == GPU7_CF_INST_TEX ) + { + // emit TEX code + if (shaderContext->analyzer.modifiesPixelActiveState) + { + src->addFmt("if( {} == true ) {{" _CRLF, _getActiveMaskCVarName(shaderContext, cfInstruction->activeStackDepth+1)); + } + _emitTEXClauseCode(shaderContext, cfInstruction); + if (shaderContext->analyzer.modifiesPixelActiveState) + { + src->add("}" _CRLF); + } + } + else if( cfInstruction->type == GPU7_CF_INST_EXPORT || cfInstruction->type == GPU7_CF_INST_EXPORT_DONE ) + { + // emit export code + 
_emitExportCode(shaderContext, cfInstruction); + } + else if( cfInstruction->type == GPU7_CF_INST_ELSE ) + { + // todo: Condition test, popCount? + src->addFmt("{} = {} == false;" _CRLF, _getActiveMaskVarName(shaderContext, cfInstruction->activeStackDepth), _getActiveMaskVarName(shaderContext, cfInstruction->activeStackDepth)); + src->addFmt("{} = {} == true && {} == true;" _CRLF, _getActiveMaskCVarName(shaderContext, cfInstruction->activeStackDepth + 1), _getActiveMaskVarName(shaderContext, cfInstruction->activeStackDepth), _getActiveMaskCVarName(shaderContext, cfInstruction->activeStackDepth)); + } + else if( cfInstruction->type == GPU7_CF_INST_POP ) + { + src->addFmt("{} = {} == true && {} == true;" _CRLF, _getActiveMaskCVarName(shaderContext, cfInstruction->activeStackDepth + 1 - cfInstruction->popCount), _getActiveMaskVarName(shaderContext, cfInstruction->activeStackDepth - cfInstruction->popCount), _getActiveMaskCVarName(shaderContext, cfInstruction->activeStackDepth - cfInstruction->popCount)); + } + else if( cfInstruction->type == GPU7_CF_INST_LOOP_START_DX10 || + cfInstruction->type == GPU7_CF_INST_LOOP_START_NO_AL) + { + // start of loop + // if pixel is disabled, then skip loop + if (ActiveSettings::ShaderPreventInfiniteLoopsEnabled()) + { + // with iteration limit to prevent infinite loops + src->addFmt("int loopCounter{} = 0;" _CRLF, (sint32)cfInstruction->cfAddr); + src->addFmt("while( {} == true && loopCounter{} < 500 )" _CRLF, _getActiveMaskCVarName(shaderContext, cfInstruction->activeStackDepth + 1), (sint32)cfInstruction->cfAddr); + src->add("{" _CRLF); + src->addFmt("loopCounter{}++;" _CRLF, (sint32)cfInstruction->cfAddr); + } + else + { + src->addFmt("while( {} == true )" _CRLF, _getActiveMaskCVarName(shaderContext, cfInstruction->activeStackDepth + 1)); + src->add("{" _CRLF); + } + } + else if( cfInstruction->type == GPU7_CF_INST_LOOP_END ) + { + // this might not always work + if( cfInstruction->popCount != 0 ) + debugBreakpoint(); + 
src->add("}" _CRLF); + } + else if( cfInstruction->type == GPU7_CF_INST_LOOP_BREAK ) + { + if( cfInstruction->popCount != 0 ) + debugBreakpoint(); + if (shaderContext->analyzer.modifiesPixelActiveState) + { + src->addFmt("if( {} == true ) {{" _CRLF, _getActiveMaskCVarName(shaderContext, cfInstruction->activeStackDepth + 1)); + } + // note: active stack level is set to the same level as the loop begin. popCount is ignored + src->add("break;" _CRLF); + + if (shaderContext->analyzer.modifiesPixelActiveState) + src->add("}" _CRLF); + + } + else if( cfInstruction->type == GPU7_CF_INST_MEM_STREAM0_WRITE || + cfInstruction->type == GPU7_CF_INST_MEM_STREAM1_WRITE ) + { + _emitStreamWriteCode(shaderContext, cfInstruction); + } + else if( cfInstruction->type == GPU7_CF_INST_MEM_RING_WRITE ) + { + _emitCFRingWriteCode(shaderContext, cfInstruction); + } + else if( cfInstruction->type == GPU7_CF_INST_EMIT_VERTEX ) + { + if( shaderContext->analyzer.modifiesPixelActiveState ) + src->addFmt("if( {} == true ) {{" _CRLF, _getActiveMaskCVarName(shaderContext, cfInstruction->activeStackDepth + 1)); + // write point size + if (shaderContext->analyzer.outputPointSize && shaderContext->analyzer.writesPointSize == false) + src->add("out.pointSize = supportBuffer.pointSize;" _CRLF); + src->add("mesh.set_vertex(vertexIndex, out);" _CRLF); + src->add("vertexIndex++;" _CRLF); + // increment transform feedback pointer + for (sint32 i = 0; i < LATTE_NUM_STREAMOUT_BUFFER; i++) + { + if (!shaderContext->output->streamoutBufferWriteMask[i]) + continue; + cemu_assert_debug((shaderContext->output->streamoutBufferStride[i] & 3) == 0); + src->addFmt("sbBase{} += {};" _CRLF, i, shaderContext->output->streamoutBufferStride[i] / 4); + } + + if( shaderContext->analyzer.modifiesPixelActiveState ) + src->add("}" _CRLF); + } + else if (cfInstruction->type == GPU7_CF_INST_CALL) + { + _emitCFCall(shaderContext, cfInstruction); + } + else if (cfInstruction->type == GPU7_CF_INST_RETURN) + { + // todo (handle 
properly) + } + else + { + cemu_assert_debug(false); + } +} + +void LatteDecompiler_emitHelperFunctions(LatteDecompilerShaderContext* shaderContext, StringBuf* fCStr_shaderSource) +{ + if( shaderContext->analyzer.hasRedcCUBE ) + { + fCStr_shaderSource->add("void redcCUBE(float4 src0, float4 src1, thread float3& stm, thread int& faceId)\r\n" + "{\r\n" + "// stm -> x .. s, y .. t, z .. MajorAxis*2.0\r\n" + + "float3 inputCoord = normalize(float3(src1.y, src1.x, src0.x));\r\n" + + "float rx = inputCoord.x;\r\n" + "float ry = inputCoord.y;\r\n" + "float rz = inputCoord.z;\r\n" + "if( abs(rx) > abs(ry) && abs(rx) > abs(rz) )\r\n" + "{\r\n" + "stm.z = rx*2.0;\r\n" + "stm.xy = float2(ry,rz); \r\n" + "if( rx >= 0.0 )\r\n" + "{\r\n" + "faceId = 0;\r\n" + "}\r\n" + "else\r\n" + "{\r\n" + "faceId = 1;\r\n" + "}\r\n" + "}\r\n" + "else if( abs(ry) > abs(rx) && abs(ry) > abs(rz) )\r\n" + "{\r\n" + "stm.z = ry*2.0;\r\n" + "stm.xy = float2(rx,rz); \r\n" + "if( ry >= 0.0 )\r\n" + "{\r\n" + "faceId = 2;\r\n" + "}\r\n" + "else\r\n" + "{\r\n" + "faceId = 3;\r\n" + "}\r\n" + "}\r\n" + "else //if( abs(rz) > abs(ry) && abs(rz) > abs(rx) )\r\n" + "{\r\n" + "stm.z = rz*2.0;\r\n" + "stm.xy = float2(rx,ry); \r\n" + "if( rz >= 0.0 )\r\n" + "{\r\n" + "faceId = 4;\r\n" + "}\r\n" + "else\r\n" + "{\r\n" + "faceId = 5;\r\n" + "}\r\n" + "}\r\n" + "}\r\n"); + } + + if( shaderContext->analyzer.hasCubeMapTexture ) + { + fCStr_shaderSource->add("float3 redcCUBEReverse(float2 st, int faceId)\r\n" + "{\r\n" + "st.yx = st.xy;\r\n" + "float3 v;\r\n" + "float majorAxis = 1.0;\r\n" + "if( faceId == 0 )\r\n" + "{\r\n" + "v.yz = (st-float2(1.5))*(majorAxis*2.0);\r\n" + "v.x = 1.0;\r\n" + "}\r\n" + "else if( faceId == 1 )\r\n" + "{\r\n" + "v.yz = (st-float2(1.5))*(majorAxis*2.0);\r\n" + "v.x = -1.0;\r\n" + "}\r\n" + "else if( faceId == 2 )\r\n" + "{\r\n" + "v.xz = (st-float2(1.5))*(majorAxis*2.0);\r\n" + "v.y = 1.0;\r\n" + "}\r\n" + "else if( faceId == 3 )\r\n" + "{\r\n" + "v.xz = 
(st-float2(1.5))*(majorAxis*2.0);\r\n" + "v.y = -1.0;\r\n" + "}\r\n" + "else if( faceId == 4 )\r\n" + "{\r\n" + "v.xy = (st-float2(1.5))*(majorAxis*2.0);\r\n" + "v.z = 1.0;\r\n" + "}\r\n" + "else\r\n" + "{\r\n" + "v.xy = (st-float2(1.5))*(majorAxis*2.0);\r\n" + "v.z = -1.0;\r\n" + "}\r\n" + + "return v;\r\n" + "}\r\n"); + } + + // Sample compare emulate (TextureT and CoordT are template parameters of the emitted MSL helper) + // TODO: only add when needed + // TODO: lod_options overload + // TODO: when the sampler has linear min mag filter, use gather and filter manually + // TODO: offset? + fCStr_shaderSource->add("" + "template<typename TextureT, typename CoordT>\r\n" + "float sampleCompareEmulate(TextureT tex, sampler samplr, CoordT coord, float compareValue) {\r\n" + "return compareValue < tex.sample(samplr, coord).x ? 1.0 : 0.0;\r\n" + "}\r\n" + ); + + // Texture calculate lod (returns floor(lod) in x and fract(lod) in y) + // TODO: only add when needed + fCStr_shaderSource->add("" + "template<typename TextureT, typename CoordT>\r\n" + "float2 textureCalculateLod(TextureT tex, sampler samplr, CoordT coord) {\r\n" + "float lod = tex.calculate_unclamped_lod(samplr, coord);\r\n" + "return float2(floor(lod), fract(lod));\r\n" + "}\r\n"); + + // clamp: v carries float bits in an int; 0x7FFFFFFF maps to the bits of 1.0, 0xFFFFFFFF to the bits of 0.0, anything else is clamped to [0.0, 1.0] as a float and returned as bits + fCStr_shaderSource->add("" + "int clampFI32(int v)\r\n" + "{\r\n" + "if( v == 0x7FFFFFFF )\r\n" + " return as_type<int>(1.0);\r\n" + "else if( v == 0xFFFFFFFF )\r\n" + " return as_type<int>(0.0);\r\n" + "return as_type<int>(clamp(as_type<float>(v), 0.0, 1.0));\r\n" + "}\r\n"); + + // mul non-ieee way (0*NaN/INF => 0.0) + if (shaderContext->options->strictMul) + { + // things we tried: + //fCStr_shaderSource->add("float mul_nonIEEE(float a, float b){ return mix(a*b,0.0,a==0.0||b==0.0); }" STR_LINEBREAK); + //fCStr_shaderSource->add("float mul_nonIEEE(float a, float b){ return mix(vec2(a*b,0.0),vec2(0.0,0.0),(equal(vec2(a),vec2(0.0,0.0))||equal(vec2(b),vec2(0.0,0.0)))).x; }" STR_LINEBREAK); + //fCStr_shaderSource->add("float mul_nonIEEE(float a, float b){ if( a == 0.0 || b == 0.0 ) return 0.0; return a*b; }" STR_LINEBREAK); + //fCStr_shaderSource->add("float mul_nonIEEE(float a, float b){float r = a*b;r = 
intBitsToFloat(floatBitsToInt(r)&(((floatBitsToInt(a) != 0) && (floatBitsToInt(b) != 0))?0xFFFFFFFF:0));return r;}" STR_LINEBREAK); works + + // for "min" it used to be: float mul_nonIEEE(float a, float b){ return min(a*b,min(abs(a)*3.40282347E+38F,abs(b)*3.40282347E+38F)); } + + if( LatteGPUState.glVendor == GLVENDOR_NVIDIA && !ActiveSettings::DumpShadersEnabled()) + fCStr_shaderSource->add("float mul_nonIEEE(float a, float b){return mix(0.0, a*b, (a != 0.0) && (b != 0.0));}" _CRLF); // compiles faster on Nvidia and also results in lower RAM usage (OpenGL) + else + fCStr_shaderSource->add("float mul_nonIEEE(float a, float b){ if( a == 0.0 || b == 0.0 ) return 0.0; return a*b; }" _CRLF); + + // DXKV-like: fCStr_shaderSource->add("float mul_nonIEEE(float a, float b){ return (b==0.0 ? 0.0 : a) * (a==0.0 ? 0.0 : b); }" _CRLF); + } +} + +#include "Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp" + +static void LatteDecompiler_emitAttributeImport(LatteDecompilerShaderContext* shaderContext, LatteParsedFetchShaderAttribute_t& attrib) +{ + auto src = shaderContext->shaderSource; + + static const char* dsMappingTableFloat[6] = { "int(attrDecoder.x)", "int(attrDecoder.y)", "int(attrDecoder.z)", "int(attrDecoder.w)", /*"floatBitsToInt(0.0)"*/ "0", /*"floatBitsToInt(1.0)"*/ "0x3f800000" }; + static const char* dsMappingTableInt[6] = { "int(attrDecoder.x)", "int(attrDecoder.y)", "int(attrDecoder.z)", "int(attrDecoder.w)", "0", "1" }; + + // get register index based on vtx semantic table + uint32 attributeShaderLoc = 0xFFFFFFFF; + for (sint32 f = 0; f < 32; f++) + { + if (shaderContext->contextRegisters[mmSQ_VTX_SEMANTIC_0 + f] == attrib.semanticId) + { + attributeShaderLoc = f; + break; + } + } + if (attributeShaderLoc == 0xFFFFFFFF) + return; // attribute is not mapped to VS input + uint32 registerIndex = attributeShaderLoc + 1; // R0 is skipped + // is register used? 
+ if ((shaderContext->analyzer.gprUseMask[registerIndex / 8] & (1 << (registerIndex % 8))) == 0) + { + src->addFmt("// skipped unused attribute for r{}" _CRLF, registerIndex); + return; + } + + LatteDecompiler_emitAttributeDecodeMSL(shaderContext->shader, src, &attrib); + + if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_SIGNED_INT) + src->addFmt("{} = int4(", _getRegisterVarName(shaderContext, registerIndex)); + else + src->addFmt("{} = float4(", _getRegisterVarName(shaderContext, registerIndex)); + for (sint32 f = 0; f < 4; f++) + { + uint8 ds = attrib.ds[f]; + if (f > 0) + src->add(", "); + _emitTypeConversionPrefixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_SIGNED_INT, shaderContext->typeTracker.defaultDataType); + if (ds >= 6) + { + cemu_assert_unimplemented(); + ds = 4; // read as 0.0 + } + if (attrib.nfa != 1) + { + src->add(dsMappingTableFloat[ds]); + } + else + { + src->add(dsMappingTableInt[ds]); + } + _emitTypeConversionSuffixMSL(shaderContext, LATTE_DECOMPILER_DTYPE_SIGNED_INT, shaderContext->typeTracker.defaultDataType); + } + src->add(");" _CRLF); +} + +void LatteDecompiler_emitMSLShader(LatteDecompilerShaderContext* shaderContext, LatteDecompilerShader* shader) +{ + bool isRectVertexShader = (static_cast(shaderContext->contextRegisters[mmVGT_PRIMITIVE_TYPE]) == LattePrimitiveMode::RECTS); + bool usesGeometryShader = (shaderContext->options->usesGeometryShader || isRectVertexShader); + bool fetchVertexManually = (usesGeometryShader || (shaderContext->fetchShader && shaderContext->fetchShader->mtlFetchVertexManually)); + + // Rasterization + rasterizationEnabled = true; + if (shader->shaderType == LatteConst::ShaderType::Vertex && !usesGeometryShader) + { + rasterizationEnabled = !shaderContext->contextRegistersNew->PA_CL_CLIP_CNTL.get_DX_RASTERIZATION_KILL(); + + // HACK + if (!shaderContext->contextRegistersNew->PA_CL_VTE_CNTL.get_VPORT_X_OFFSET_ENA()) + rasterizationEnabled = true; + + const auto& polygonControlReg = 
shaderContext->contextRegistersNew->PA_SU_SC_MODE_CNTL; + uint32 cullFront = polygonControlReg.get_CULL_FRONT(); + uint32 cullBack = polygonControlReg.get_CULL_BACK(); + if (cullFront && cullBack) + rasterizationEnabled = false; + } + + StringBuf* src = new StringBuf(1024*1024*12); // reserve 12MB for generated source (we resize-to-fit at the end) + shaderContext->shaderSource = src; + + // debug info + src->addFmt("// shader {:016x}" _CRLF, shaderContext->shaderBaseHash); +#ifdef CEMU_DEBUG_ASSERT + src->addFmt("// usesIntegerValues: {}" _CRLF, shaderContext->analyzer.usesIntegerValues ? "true" : "false"); + src->addFmt(_CRLF); +#endif + // include metal standard library (the header name must be spelled out; the using-directive below relies on it) + src->add("#include <metal_stdlib>" _CRLF); + src->add("using namespace metal;" _CRLF); + // header part (definitions for inputs and outputs) + LatteDecompiler::emitHeader(shaderContext, isRectVertexShader, fetchVertexManually, rasterizationEnabled); + // helper functions + LatteDecompiler_emitHelperFunctions(shaderContext, src); + const char* functionType = ""; + const char* outputTypeName = ""; + switch (shader->shaderType) + { + case LatteConst::ShaderType::Vertex: + if (fetchVertexManually) + { + // TODO: clean this up + // fetchVertex will modify vid in case of an object shader and an indexed draw + + // Vertex buffers + std::string vertexBufferDefinitions = "#define VERTEX_BUFFER_DEFINITIONS "; + std::string vertexBuffers = "#define VERTEX_BUFFERS "; + std::string inputFetchDefinition = "VertexIn fetchVertex("; + if (usesGeometryShader) + inputFetchDefinition += "thread uint&"; + else + inputFetchDefinition += "uint"; + inputFetchDefinition += " vid, uint iid"; + if (usesGeometryShader) + inputFetchDefinition += ", device uint* indexBuffer, uchar indexType"; + inputFetchDefinition += " VERTEX_BUFFER_DEFINITIONS) {\n"; + + // Index buffer + if (usesGeometryShader) + { + inputFetchDefinition += "if (indexType == 1) // UShort\n"; + inputFetchDefinition += "vid = ((device ushort*)indexBuffer)[vid];\n"; + 
inputFetchDefinition += "else if (indexType == 2) // UInt\n"; + inputFetchDefinition += "vid = ((device uint*)indexBuffer)[vid];\n"; + } + + inputFetchDefinition += "VertexIn in;\n"; + for (auto& bufferGroup : shaderContext->fetchShader->bufferGroups) + { + std::optional fetchType; + + uint32 bufferIndex = bufferGroup.attributeBufferIndex; + uint32 bufferBaseRegisterIndex = mmSQ_VTX_ATTRIBUTE_BLOCK_START + bufferIndex * 7; + uint32 bufferStride = (shaderContext->contextRegisters[bufferBaseRegisterIndex + 2] >> 11) & 0xFFFF; + + for (sint32 j = 0; j < bufferGroup.attribCount; ++j) + { + auto& attr = bufferGroup.attrib[j]; + + uint32 semanticId = shaderContext->output->resourceMappingMTL.attributeMapping[attr.semanticId]; + if (semanticId == (uint32)-1) + continue; // attribute not used? + + std::string formatName; + uint8 componentCount = 0; + switch (GetMtlVertexFormat(attr.format)) + { + case MTL::VertexFormatUChar: + formatName = "uchar"; + componentCount = 1; + break; + case MTL::VertexFormatUChar2: + formatName = "uchar2"; + componentCount = 2; + break; + case MTL::VertexFormatUChar3: + formatName = "uchar3"; + componentCount = 3; + break; + case MTL::VertexFormatUChar4: + formatName = "uchar4"; + componentCount = 4; + break; + case MTL::VertexFormatUShort: + formatName = "ushort"; + componentCount = 1; + break; + case MTL::VertexFormatUShort2: + formatName = "ushort2"; + componentCount = 2; + break; + case MTL::VertexFormatUShort3: + formatName = "ushort3"; + componentCount = 3; + break; + case MTL::VertexFormatUShort4: + formatName = "ushort4"; + componentCount = 4; + break; + case MTL::VertexFormatUInt: + formatName = "uint"; + componentCount = 1; + break; + case MTL::VertexFormatUInt2: + formatName = "uint2"; + componentCount = 2; + break; + case MTL::VertexFormatUInt3: + formatName = "uint3"; + componentCount = 3; + break; + case MTL::VertexFormatUInt4: + formatName = "uint4"; + componentCount = 4; + break; + } + + // Get the fetch type + std::string 
fetchTypeStr; + if (attr.fetchType == LatteConst::VertexFetchType2::VERTEX_DATA) + fetchTypeStr = "vid"; + else if (attr.fetchType == LatteConst::VertexFetchType2::INSTANCE_DATA) + fetchTypeStr = "iid"; + else if (attr.fetchType == LatteConst::VertexFetchType2::NO_INDEX_OFFSET_DATA) + fetchTypeStr = "0"; // TODO: correct? + + // Fetch the attribute + inputFetchDefinition += fmt::format("in.ATTRIBUTE_NAME{} = uint4(uint", semanticId); + if (componentCount != 1) + inputFetchDefinition += fmt::format("{}", componentCount); + inputFetchDefinition += fmt::format("(*(device {}*)", formatName); + inputFetchDefinition += fmt::format("(vertexBuffer{}", attr.attributeBufferIndex); + inputFetchDefinition += fmt::format(" + {} * {} + {}))", fetchTypeStr, bufferStride, attr.offset); + for (uint8 i = 0; i < (4 - componentCount); i++) + inputFetchDefinition += ", 0"; + inputFetchDefinition += ");\n"; + + if (fetchType.has_value()) + cemu_assert_debug(fetchType == attr.fetchType); + else + fetchType = attr.fetchType; + + if (attr.fetchType == LatteConst::INSTANCE_DATA) + { + cemu_assert_debug(attr.aluDivisor == 1); // other divisor not yet supported + } + } + + // TODO: fetch type + + vertexBufferDefinitions += fmt::format(", device uchar* vertexBuffer{} [[buffer({})]]", bufferIndex, GET_MTL_VERTEX_BUFFER_INDEX(bufferIndex)); + vertexBuffers += fmt::format(", vertexBuffer{}", bufferIndex); + } + + inputFetchDefinition += "return in;\n"; + inputFetchDefinition += "}\n"; + + src->add(vertexBufferDefinitions.c_str()); + src->add("\n"); + src->add(vertexBuffers.c_str()); + src->add("\n"); + src->add(inputFetchDefinition.c_str()); + } + + if (usesGeometryShader) + { + functionType = "[[object, max_total_threads_per_threadgroup(VERTICES_PER_VERTEX_PRIMITIVE), max_total_threadgroups_per_mesh_grid(1)]]"; + outputTypeName = "void"; + } + else + { + functionType = "vertex"; + if (rasterizationEnabled) + outputTypeName = "VertexOut"; + else + outputTypeName = "void"; + } + break; + case 
LatteConst::ShaderType::Geometry: + functionType = "[[mesh, max_total_threads_per_threadgroup(1)]]"; + outputTypeName = "void"; + break; + case LatteConst::ShaderType::Pixel: + functionType = "fragment"; + outputTypeName = "FragmentOut"; + break; + } + // start of main + src->addFmt("{} {} main0(", functionType, outputTypeName); + LatteDecompiler::emitInputs(shaderContext, isRectVertexShader, fetchVertexManually); + src->add(") {" _CRLF); + if (fetchVertexManually && (shader->shaderType == LatteConst::ShaderType::Vertex || shader->shaderType == LatteConst::ShaderType::Geometry)) + { + if (shader->shaderType == LatteConst::ShaderType::Vertex) + { + if (usesGeometryShader) + { + // Calculate the imaginary vertex id + src->add("uint vid = tig * VERTICES_PER_VERTEX_PRIMITIVE + tid;" _CRLF); + src->add("uint iid = vid / supportBuffer.verticesPerInstance;" _CRLF); + src->add("vid %= supportBuffer.verticesPerInstance;" _CRLF); + + // Fetch the input + src->add("VertexIn in = fetchVertex(vid, iid, indexBuffer, indexType VERTEX_BUFFERS);" _CRLF); + + // Output is defined as object payload + src->add("object_data VertexOut& out = objectPayload.vertexOut[tid];" _CRLF); + } + else + { + // Fetch the input + src->add("VertexIn in = fetchVertex(vid, iid VERTEX_BUFFERS);" _CRLF); + + if (rasterizationEnabled) + src->add("VertexOut out;" _CRLF); + } + } + else if (shader->shaderType == LatteConst::ShaderType::Geometry) + { + src->add("GeometryOut out;" _CRLF); + // The index of the current vertex that is being emitted + src->add("uint vertexIndex = 0;" _CRLF); + } + } + else + { + if (rasterizationEnabled) + src->addFmt("{} out;" _CRLF, outputTypeName); + } + // variable definition + if (shaderContext->typeTracker.useArrayGPRs == false) + { + // each register is a separate variable + for (sint32 i = 0; i < 128; i++) + { + if (shaderContext->analyzer.usesRelativeGPRRead || (shaderContext->analyzer.gprUseMask[i / 8] & (1 << (i & 7))) != 0) + { + if 
(shaderContext->typeTracker.genIntReg) + src->addFmt("int4 R{}i = int4(0);" _CRLF, i); + else if (shaderContext->typeTracker.genFloatReg) + src->addFmt("float4 R{}f = float4(0.0);" _CRLF, i); + } + } + } + else + { + // registers are represented using a single large array + if (shaderContext->typeTracker.genIntReg) + src->addFmt("int4 Ri[128];" _CRLF); + else if (shaderContext->typeTracker.genFloatReg) + src->addFmt("float4 Rf[128];" _CRLF); + for (sint32 i = 0; i < 128; i++) + { + if (shaderContext->typeTracker.genIntReg) + src->addFmt("Ri[{}] = int4(0);" _CRLF, i); + else if (shaderContext->typeTracker.genFloatReg) + src->addFmt("Rf[{}] = float4(0.0);" _CRLF, i); + } + } + + if( shader->shaderType == LatteConst::ShaderType::Vertex ) + src->addFmt("uint4 attrDecoder;" _CRLF); + if (shaderContext->typeTracker.genIntReg) + src->addFmt("int backupReg0i, backupReg1i, backupReg2i, backupReg3i, backupReg4i;" _CRLF); + if (shaderContext->typeTracker.genFloatReg) + src->addFmt("float backupReg0f, backupReg1f, backupReg2f, backupReg3f, backupReg4f;" _CRLF); + if (shaderContext->typeTracker.genIntReg) + { + src->addFmt("int PV0ix = 0, PV0iy = 0, PV0iz = 0, PV0iw = 0, PV1ix = 0, PV1iy = 0, PV1iz = 0, PV1iw = 0;" _CRLF); + src->addFmt("int PS0i = 0, PS1i = 0;" _CRLF); + src->addFmt("int4 tempi = int4(0);" _CRLF); + } + if (shaderContext->typeTracker.genFloatReg) + { + src->addFmt("float PV0fx = 0.0, PV0fy = 0.0, PV0fz = 0.0, PV0fw = 0.0, PV1fx = 0.0, PV1fy = 0.0, PV1fz = 0.0, PV1fw = 0.0;" _CRLF); + src->addFmt("float PS0f = 0.0, PS1f = 0.0;" _CRLF); + src->addFmt("float4 tempf = float4(0.0);" _CRLF); + } + if (shaderContext->analyzer.hasGradientLookup) + { + src->add("float4 gradH;" _CRLF); + src->add("float4 gradV;" _CRLF); + } + src->add("float tempResultf;" _CRLF); + src->add("int tempResulti;" _CRLF); + src->add("int4 ARi = int4(0);" _CRLF); + src->add("bool predResult = true;" _CRLF); + if(shaderContext->analyzer.modifiesPixelActiveState ) + { + src->addFmt("bool 
activeMaskStack[{}];" _CRLF, shaderContext->analyzer.activeStackMaxDepth+1); + src->addFmt("bool activeMaskStackC[{}];" _CRLF, shaderContext->analyzer.activeStackMaxDepth+2); + for (sint32 i = 0; i < shaderContext->analyzer.activeStackMaxDepth; i++) + { + src->addFmt("activeMaskStack[{}] = false;" _CRLF, i); + } + for (sint32 i = 0; i < shaderContext->analyzer.activeStackMaxDepth+1; i++) + { + src->addFmt("activeMaskStackC[{}] = false;" _CRLF, i); + } + src->addFmt("activeMaskStack[0] = true;" _CRLF); + src->addFmt("activeMaskStackC[0] = true;" _CRLF); + src->addFmt("activeMaskStackC[1] = true;" _CRLF); + // generate vars for each subroutine + for (auto& subroutineInfo : shaderContext->list_subroutines) + { + sint32 subroutineMaxStackDepth = 0; + src->addFmt("bool activeMaskStackSub{:04x}[{}];" _CRLF, subroutineInfo.cfAddr, subroutineMaxStackDepth + 1); + src->addFmt("bool activeMaskStackCSub{:04x}[{}];" _CRLF, subroutineInfo.cfAddr, subroutineMaxStackDepth + 2); + } + } + // helper variables for cube maps (todo: Only emit when used) + if (shaderContext->analyzer.hasRedcCUBE) + { + src->add("float3 cubeMapSTM;" _CRLF); + src->add("int cubeMapFaceId;" _CRLF); + } + for(sint32 i=0; ioutput->textureUnitMask[i]) + continue; + if( shader->textureUnitDim[i] != Latte::E_DIM::DIM_CUBEMAP ) + continue; + src->addFmt("float cubeMapArrayIndex{} = 0.0;" _CRLF, i); + } + // init base offset for streamout buffer writes + if (shader->shaderType == LatteConst::ShaderType::Vertex || shader->shaderType == LatteConst::ShaderType::Geometry) + { + for (sint32 i = 0; i < LATTE_NUM_STREAMOUT_BUFFER; i++) + { + if(!shaderContext->output->streamoutBufferWriteMask[i]) + continue; + + cemu_assert_debug((shaderContext->output->streamoutBufferStride[i]&3) == 0); + + if (shader->shaderType == LatteConst::ShaderType::Vertex) // vertex shader + src->addFmt("int sbBase{} = supportBuffer.streamoutBufferBase{}/4 + (vid + supportBuffer.verticesPerInstance * iid)*{};" _CRLF, i, i, 
shaderContext->output->streamoutBufferStride[i] / 4); + else // geometry shader + { + uint32 gsOutPrimType = shaderContext->contextRegisters[mmVGT_GS_OUT_PRIM_TYPE]; + uint32 bytesPerVertex = shaderContext->contextRegisters[mmSQ_GS_VERT_ITEMSIZE] * 4; + uint32 maxVerticesInGS = ((shaderContext->contextRegisters[mmSQ_GSVS_RING_ITEMSIZE] & 0x7FFF) * 4) / bytesPerVertex; + + cemu_assert_debug(gsOutPrimType == 0); // currently we only properly handle GS output primitive points + + src->addFmt("int sbBase{} = supportBuffer.streamoutBufferBase{}/4 + (gl_PrimitiveIDIn * {})*{};" _CRLF, i, i, maxVerticesInGS, shaderContext->output->streamoutBufferStride[i] / 4); + } + } + + } + // code to load inputs from previous stage + if( shader->shaderType == LatteConst::ShaderType::Vertex ) + { + if( (shaderContext->analyzer.gprUseMask[0/8]&(1<<(0%8))) != 0 ) + { + if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_SIGNED_INT) + src->addFmt("{} = int4(vid, 0, 0, iid);" _CRLF, _getRegisterVarName(shaderContext, 0)); + else if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_FLOAT) + src->addFmt("{} = float4(vid, 0, 0, iid);" _CRLF, _getRegisterVarName(shaderContext, 0)); // TODO: as_type(float4(vid, 0, 0, iid))? 
+ else + cemu_assert_unimplemented(); + } + + LatteFetchShader* parsedFetchShader = shaderContext->fetchShader; + for(auto& bufferGroup : parsedFetchShader->bufferGroups) + { + for(sint32 i=0; ibufferGroupsInvalid) + { + // these attributes point to non-existent buffers + // todo - figure out how the hardware actually handles this, currently we assume the input values are zero + for (sint32 i = 0; i < bufferGroup.attribCount; i++) + LatteDecompiler_emitAttributeImport(shaderContext, bufferGroup.attrib[i]); + } + } + else if (shader->shaderType == LatteConst::ShaderType::Pixel) + { + LatteShaderPSInputTable* psInputTable = LatteSHRC_GetPSInputTable(); + + uint32 psControl0 = shaderContext->contextRegisters[mmSPI_PS_IN_CONTROL_0]; + uint32 psControl1 = shaderContext->contextRegisters[mmSPI_PS_IN_CONTROL_1]; + + uint32 spiInterpControl = shaderContext->contextRegisters[mmSPI_INTERP_CONTROL_0]; + uint8 spriteEnable = (spiInterpControl >> 1) & 1; + cemu_assert_debug(spriteEnable == 0); + + uint8 frontFace_enabled = (psControl1 >> 8) & 1; + uint8 frontFace_chan = (psControl1 >> 9) & 3; + uint8 frontFace_allBits = (psControl1 >> 11) & 1; + uint8 frontFace_regIndex = (psControl1 >> 12) & 0x1F; + + // handle param_gen + if (psInputTable->paramGen != 0) + { + cemu_assert_debug((psInputTable->paramGen) == 1); // handle the other bits (the same set of coordinates with different perspective/projection settings?) 
+ uint32 paramGenGPRIndex = psInputTable->paramGenGPR; + if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_FLOAT) + src->addFmt("{} = pointCoord.xyxy;" _CRLF, _getRegisterVarName(shaderContext, paramGenGPRIndex)); + else + src->addFmt("{} = as_type(pointCoord.xyxy);" _CRLF, _getRegisterVarName(shaderContext, paramGenGPRIndex)); + } + + for (sint32 i = 0; i < psInputTable->count; i++) + { + uint32 psControl0 = shaderContext->contextRegisters[mmSPI_PS_IN_CONTROL_0]; + uint32 spi0_paramGen = (psControl0 >> 15) & 0xF; + + sint32 gprIndex = i;// +spi0_paramGen + paramRegOffset; + if ((shaderContext->analyzer.gprUseMask[gprIndex / 8] & (1 << (gprIndex % 8))) == 0 && shaderContext->analyzer.usesRelativeGPRRead == false) + continue; + uint32 psInputSemanticId = psInputTable->import[i].semanticId; + if (psInputSemanticId == LATTE_ANALYZER_IMPORT_INDEX_SPIPOSITION) + { + if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_FLOAT) + src->addFmt("{} = GET_FRAGCOORD();" _CRLF, _getRegisterVarName(shaderContext, gprIndex)); + else + src->addFmt("{} = as_type(GET_FRAGCOORD());" _CRLF, _getRegisterVarName(shaderContext, gprIndex)); + continue; + } + + if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_SIGNED_INT) + src->addFmt("{} = as_type(in.passParameterSem{});" _CRLF, _getRegisterVarName(shaderContext, gprIndex), psInputSemanticId); + else if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_FLOAT) + src->addFmt("{} = in.passParameterSem{};" _CRLF, _getRegisterVarName(shaderContext, gprIndex), psInputSemanticId); + else + cemu_assert_unimplemented(); + } + // front facing attribute + if (frontFace_enabled) + { + if ((shaderContext->analyzer.gprUseMask[0 / 8] & (1 << (0 % 8))) != 0) + { + if (frontFace_allBits) + cemu_assert_debug(false); + if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_SIGNED_INT) + src->addFmt("{}.{} = as_type(frontFacing ? 
1.0 : 0.0);" _CRLF, _getRegisterVarName(shaderContext, frontFace_regIndex), _getElementStrByIndex(frontFace_chan)); + else if (shaderContext->typeTracker.defaultDataType == LATTE_DECOMPILER_DTYPE_FLOAT) + src->addFmt("{}.{} = frontFacing ? 1.0 : 0.0;" _CRLF, _getRegisterVarName(shaderContext, frontFace_regIndex), _getElementStrByIndex(frontFace_chan)); + else + cemu_assert_debug(false); + } + } + } + for(auto& cfInstruction : shaderContext->cfInstructions) + LatteDecompiler_emitClauseCodeMSL(shaderContext, &cfInstruction, false); + //if(shader->shaderType == LatteConst::ShaderType::Geometry) + // src->add("EndPrimitive();" _CRLF); + // vertex shader should write renderstate point size at the end if required but not modified by shader + if (shaderContext->analyzer.outputPointSize && !shaderContext->analyzer.writesPointSize) + { + if (shader->shaderType == LatteConst::ShaderType::Vertex && !shaderContext->options->usesGeometryShader && rasterizationEnabled) + src->add("out.pointSize = supportBuffer.pointSize;" _CRLF); + } + + if (usesGeometryShader && (shader->shaderType == LatteConst::ShaderType::Vertex || shader->shaderType == LatteConst::ShaderType::Geometry)) + { + if (shader->shaderType == LatteConst::ShaderType::Vertex) + { + src->add("if (tid == 0) {" _CRLF); + src->add("meshGridProperties.set_threadgroups_per_grid(uint3(1, 1, 1));" _CRLF); + src->add("}" _CRLF); + } + else if (shader->shaderType == LatteConst::ShaderType::Geometry) + { + src->add("mesh.set_primitive_count(GET_PRIMITIVE_COUNT(vertexIndex));" _CRLF); + + // Set indices + if (shaderContext->contextRegisters[mmVGT_GS_OUT_PRIM_TYPE] == 1) // Line strip: index i belongs to segment p = i / 2 and selects strip vertex p or p+1 (same scheme as the triangle strip case) + { + src->add("for (uint8_t i = 0; i < GET_PRIMITIVE_COUNT(vertexIndex) * 2; i++) {" _CRLF); + src->add("mesh.set_index(i, (i / 2) + i % 2);" _CRLF); + src->add("}" _CRLF); + } + else if (shaderContext->contextRegisters[mmVGT_GS_OUT_PRIM_TYPE] == 2) // Triangle strip + { + src->add("for (uint8_t i = 0; i < GET_PRIMITIVE_COUNT(vertexIndex) * 3; i++) 
{" _CRLF); + src->add("mesh.set_index(i, (i / 3) + i % 3);" _CRLF); + src->add("}" _CRLF); + } + else + { + src->add("for (uint8_t i = 0; i < vertexIndex; i++) {" _CRLF); + src->add("mesh.set_index(i, i);" _CRLF); + src->add("}" _CRLF); + } + } + } + + if (rasterizationEnabled && (!usesGeometryShader || shader->shaderType == LatteConst::ShaderType::Pixel)) + { + // Return + src->add("return out;" _CRLF); + } + + // end of shader main + src->add("}" _CRLF); + src->shrink_to_fit(); + shader->strBuf_shaderSource = src; +} diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLAttrDecoder.cpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLAttrDecoder.cpp new file mode 100644 index 000000000..9ee5c31f1 --- /dev/null +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLAttrDecoder.cpp @@ -0,0 +1,511 @@ +#include "Cafe/HW/Latte/Core/LatteConst.h" +#include "Cafe/HW/Latte/Core/LatteShaderAssembly.h" +#include "Cafe/HW/Latte/ISA/RegDefines.h" +#include "Cafe/HW/Latte/Core/Latte.h" +#include "Cafe/HW/Latte/Core/LatteDraw.h" +#include "Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompiler.h" +#include "Cafe/HW/Latte/Core/FetchShader.h" +#include "Cafe/HW/Latte/Renderer/Renderer.h" +#include "util/helpers/StringBuf.h" + +#define _CRLF "\r\n" +/* The _read*Attribute* helpers each append MSL statements to src that load the vertex input in.attrDataSem<attributeInputIndex> into attrDecoder. The LittleEndian variants copy the used components and zero-fill the remaining lanes. The BigEndian variants additionally emit a byte swap of every loaded component (4-byte swap for U32, 2-byte swap for U16). The shaderContext parameter is not used by any of them. */ +static void _readLittleEndianAttributeU32x4(LatteDecompilerShader* shaderContext, StringBuf* src, uint32 attributeInputIndex) +{ + src->addFmt("attrDecoder = in.attrDataSem{};" _CRLF, attributeInputIndex); +} + +static void _readLittleEndianAttributeU32x3(LatteDecompilerShader* shaderContext, StringBuf* src, uint32 attributeInputIndex) +{ + src->addFmt("attrDecoder = uint4(in.attrDataSem{}.xyz,0);" _CRLF, attributeInputIndex); +} + +static void _readLittleEndianAttributeU32x2(LatteDecompilerShader* shaderContext, StringBuf* src, uint32 attributeInputIndex) +{ + src->addFmt("attrDecoder = uint4(in.attrDataSem{}.xy,0,0);" _CRLF, attributeInputIndex); +} + +static void
_readLittleEndianAttributeU32x1(LatteDecompilerShader* shaderContext, StringBuf* src, uint32 attributeInputIndex) +{ + src->addFmt("attrDecoder = uint4(in.attrDataSem{}.x,0,0,0);" _CRLF, attributeInputIndex); +} + +static void _readLittleEndianAttributeU16x2(LatteDecompilerShader* shaderContext, StringBuf* src, uint32 attributeInputIndex) +{ + src->addFmt("attrDecoder = uint4(in.attrDataSem{}.xy,0,0);" _CRLF, attributeInputIndex); +} + +static void _readLittleEndianAttributeU16x4(LatteDecompilerShader* shaderContext, StringBuf* src, uint32 attributeInputIndex) +{ + src->addFmt("attrDecoder = in.attrDataSem{};" _CRLF, attributeInputIndex); +} + +static void _readBigEndianAttributeU32x4(LatteDecompilerShader* shaderContext, StringBuf* src, uint32 attributeInputIndex) +{ + src->addFmt("attrDecoder = in.attrDataSem{};" _CRLF, attributeInputIndex); + src->add("attrDecoder = (attrDecoder>>24)|((attrDecoder>>8)&0xFF00)|((attrDecoder<<8)&0xFF0000)|((attrDecoder<<24));" _CRLF); +} + +static void _readBigEndianAttributeU32x3(LatteDecompilerShader* shaderContext, StringBuf* src, uint32 attributeInputIndex) +{ + src->addFmt("attrDecoder.xyz = in.attrDataSem{}.xyz;" _CRLF, attributeInputIndex); + src->add("attrDecoder.xyz = (attrDecoder.xyz>>24)|((attrDecoder.xyz>>8)&0xFF00)|((attrDecoder.xyz<<8)&0xFF0000)|((attrDecoder.xyz<<24));" _CRLF); + src->add("attrDecoder.w = 0;" _CRLF); +} + +static void _readBigEndianAttributeU32x2(LatteDecompilerShader* shaderContext, StringBuf* src, uint32 attributeInputIndex) +{ + src->addFmt("attrDecoder.xy = in.attrDataSem{}.xy;" _CRLF, attributeInputIndex); + src->add("attrDecoder.xy = (attrDecoder.xy>>24)|((attrDecoder.xy>>8)&0xFF00)|((attrDecoder.xy<<8)&0xFF0000)|((attrDecoder.xy<<24));" _CRLF); + src->add("attrDecoder.z = 0;" _CRLF); + src->add("attrDecoder.w = 0;" _CRLF); +} + +static void _readBigEndianAttributeU32x1(LatteDecompilerShader* shaderContext, StringBuf* src, uint32 attributeInputIndex) +{ + src->addFmt("attrDecoder.x =
in.attrDataSem{}.x;" _CRLF, attributeInputIndex); + src->add("attrDecoder.x = (attrDecoder.x>>24)|((attrDecoder.x>>8)&0xFF00)|((attrDecoder.x<<8)&0xFF0000)|((attrDecoder.x<<24));" _CRLF); + src->add("attrDecoder.y = 0;" _CRLF); + src->add("attrDecoder.z = 0;" _CRLF); + src->add("attrDecoder.w = 0;" _CRLF); +} + +static void _readBigEndianAttributeU16x1(LatteDecompilerShader* shaderContext, StringBuf* src, uint32 attributeInputIndex) +{ + src->addFmt("attrDecoder.xy = in.attrDataSem{}.xy;" _CRLF, attributeInputIndex); + src->add("attrDecoder.x = ((attrDecoder.x>>8)&0xFF)|((attrDecoder.x<<8)&0xFF00);" _CRLF); + src->add("attrDecoder.y = 0;" _CRLF); + src->add("attrDecoder.z = 0;" _CRLF); + src->add("attrDecoder.w = 0;" _CRLF); +} + +static void _readBigEndianAttributeU16x2(LatteDecompilerShader* shaderContext, StringBuf* src, uint32 attributeInputIndex) +{ + src->addFmt("attrDecoder.xy = in.attrDataSem{}.xy;" _CRLF, attributeInputIndex); + src->add("attrDecoder.xy = ((attrDecoder.xy>>8)&0xFF)|((attrDecoder.xy<<8)&0xFF00);" _CRLF); + src->add("attrDecoder.z = 0;" _CRLF); + src->add("attrDecoder.w = 0;" _CRLF); +} + +static void _readBigEndianAttributeU16x4(LatteDecompilerShader* shaderContext, StringBuf* src, uint32 attributeInputIndex) +{ + src->addFmt("attrDecoder.xyzw = in.attrDataSem{}.xyzw;" _CRLF, attributeInputIndex); + src->add("attrDecoder = ((attrDecoder>>8)&0xFF)|((attrDecoder<<8)&0xFF00);" _CRLF); +} +/* Emits the MSL statements that decode one vertex attribute (in.attrDataSem<semanticId>) into attrDecoder. The decode is picked by attrib->endianSwap, then by format / nfa / isSigned. Float results are stored as raw bits through as_type<uintN>(), which in MSL has to name its destination type; packed half pairs are unpacked through as_type<half2>(). An attribute whose buffer index is out of range decodes to zero. Unsupported combinations are only logged (plus an assert in the SWAP_U32 and SWAP_NONE branches). NOTE(review): the out-of-range path assigns int4(0) while every other path writes uint values; confirm this matches the declared type of attrDecoder. */ +void LatteDecompiler_emitAttributeDecodeMSL(LatteDecompilerShader* shaderContext, StringBuf* src, LatteParsedFetchShaderAttribute_t* attrib) +{ + if (attrib->attributeBufferIndex >= Latte::GPU_LIMITS::NUM_VERTEX_BUFFERS) + { + src->add("attrDecoder = int4(0);" _CRLF); + return; + } + + uint32 attributeInputIndex = attrib->semanticId; + if( attrib->endianSwap == LatteConst::VertexFetchEndianMode::SWAP_U32 ) + { + if( attrib->format == FMT_32_32_32_32_FLOAT && attrib->nfa == 2 ) + { + _readBigEndianAttributeU32x4(shaderContext, src, attributeInputIndex); + }
+ else if( attrib->format == FMT_32_32_32_FLOAT && attrib->nfa == 2 ) + { + _readBigEndianAttributeU32x3(shaderContext, src, attributeInputIndex); + } + else if( attrib->format == FMT_32_32_FLOAT && attrib->nfa == 2 ) + { + _readBigEndianAttributeU32x2(shaderContext, src, attributeInputIndex); + } + else if( attrib->format == FMT_32_FLOAT && attrib->nfa == 2 ) + { + _readBigEndianAttributeU32x1(shaderContext, src, attributeInputIndex); + } + else if( attrib->format == FMT_2_10_10_10 && attrib->nfa == 0 ) + { + _readBigEndianAttributeU32x1(shaderContext, src, attributeInputIndex); + // Bayonetta 2 uses this format to store normals + src->add("attrDecoder.xyzw = uint4((attrDecoder.x>>0)&0x3FF,(attrDecoder.x>>10)&0x3FF,(attrDecoder.x>>20)&0x3FF,(attrDecoder.x>>30)&0x3);" _CRLF); + if (attrib->isSigned != 0) + { + src->add("if( (attrDecoder.x&0x200) != 0 ) attrDecoder.x |= 0xFFFFFC00;" _CRLF); + src->add("if( (attrDecoder.y&0x200) != 0 ) attrDecoder.y |= 0xFFFFFC00;" _CRLF); + src->add("if( (attrDecoder.z&0x200) != 0 ) attrDecoder.z |= 0xFFFFFC00;" _CRLF); + src->add("attrDecoder.x = as_type<uint>(max(float(int(attrDecoder.x))/511.0,-1.0));" _CRLF); + src->add("attrDecoder.y = as_type<uint>(max(float(int(attrDecoder.y))/511.0,-1.0));" _CRLF); + src->add("attrDecoder.z = as_type<uint>(max(float(int(attrDecoder.z))/511.0,-1.0));" _CRLF); + } + else + { + src->add("attrDecoder.x = as_type<uint>(max(float(int(attrDecoder.x))/1023.0,-1.0));" _CRLF); + src->add("attrDecoder.y = as_type<uint>(max(float(int(attrDecoder.y))/1023.0,-1.0));" _CRLF); + src->add("attrDecoder.z = as_type<uint>(max(float(int(attrDecoder.z))/1023.0,-1.0));" _CRLF); + } + src->add("attrDecoder.w = as_type<uint>(float(attrDecoder.w));" _CRLF); // unsure?
+ + } + else if( attrib->format == FMT_32_32_32_32 && attrib->nfa == 1 && attrib->isSigned == 0 ) + { + _readBigEndianAttributeU32x4(shaderContext, src, attributeInputIndex); + } + else if( attrib->format == FMT_32_32_32 && attrib->nfa == 1 && attrib->isSigned == 0 ) + { + _readBigEndianAttributeU32x3(shaderContext, src, attributeInputIndex); + } + else if( attrib->format == FMT_32_32 && attrib->nfa == 1 && attrib->isSigned == 0 ) + { + _readBigEndianAttributeU32x2(shaderContext, src, attributeInputIndex); + } + else if (attrib->format == FMT_32 && attrib->nfa == 1 && attrib->isSigned == 0) + { + _readBigEndianAttributeU32x1(shaderContext, src, attributeInputIndex); + } + else if (attrib->format == FMT_32 && attrib->nfa == 1 && attrib->isSigned == 1) + { + // we can just read the signed s32 as a u32 since no sign-extension is necessary + _readBigEndianAttributeU32x1(shaderContext, src, attributeInputIndex); + } + else if( attrib->format == FMT_8_8_8_8 && attrib->nfa == 0 && attrib->isSigned == 0 ) + { + // seen in Minecraft Wii U Edition + src->addFmt("attrDecoder.xyzw = as_type<uint4>(float4(in.attrDataSem{}.wzyx)/255.0);" _CRLF, attributeInputIndex); + } + else if( attrib->format == FMT_8_8_8_8 && attrib->nfa == 0 && attrib->isSigned != 0 ) + { + // seen in Minecraft Wii U Edition + src->addFmt("attrDecoder.xyzw = in.attrDataSem{}.wzyx;" _CRLF, attributeInputIndex); + src->add("if( (attrDecoder.x&0x80) != 0 ) attrDecoder.x |= 0xFFFFFF00;" _CRLF); + src->add("if( (attrDecoder.y&0x80) != 0 ) attrDecoder.y |= 0xFFFFFF00;" _CRLF); + src->add("if( (attrDecoder.z&0x80) != 0 ) attrDecoder.z |= 0xFFFFFF00;" _CRLF); + src->add("if( (attrDecoder.w&0x80) != 0 ) attrDecoder.w |= 0xFFFFFF00;" _CRLF); + src->add("attrDecoder.x = as_type<uint>(max(float(int(attrDecoder.x))/127.0,-1.0));" _CRLF); + src->add("attrDecoder.y = as_type<uint>(max(float(int(attrDecoder.y))/127.0,-1.0));" _CRLF); + src->add("attrDecoder.z = as_type<uint>(max(float(int(attrDecoder.z))/127.0,-1.0));" _CRLF); +
src->add("attrDecoder.w = as_type<uint>(max(float(int(attrDecoder.w))/127.0,-1.0));" _CRLF); + } + else if( attrib->format == FMT_8_8_8_8 && attrib->nfa == 1 && attrib->isSigned == 0 ) + { + // seen in Minecraft Wii U Edition + src->addFmt("attrDecoder.xyzw = in.attrDataSem{}.wzyx;" _CRLF, attributeInputIndex); + } + else if (attrib->format == FMT_8_8_8_8 && attrib->nfa == 2 && attrib->isSigned == 0) + { + // seen in Ben 10 Omniverse + src->addFmt("attrDecoder.xyzw = as_type<uint4>(float4(in.attrDataSem{}.wzyx));" _CRLF, attributeInputIndex); + } + else + { + cemuLog_log(LogType::Force, "_emitAttributeDecode(): Unsupported fmt {:02x} nfa {} signed {} endian {}\n", attrib->format, attrib->nfa, attrib->isSigned, attrib->endianSwap); + cemu_assert_unimplemented(); + } + } + else if( attrib->endianSwap == LatteConst::VertexFetchEndianMode::SWAP_NONE ) + { + if( attrib->format == FMT_32_32_32_32_FLOAT && attrib->nfa == 2 ) + { + _readLittleEndianAttributeU32x4(shaderContext, src, attributeInputIndex); + } + else if (attrib->format == FMT_32_32_32_FLOAT && attrib->nfa == 2) + { + _readLittleEndianAttributeU32x3(shaderContext, src, attributeInputIndex); + } + else if (attrib->format == FMT_32_32_FLOAT && attrib->nfa == 2) + { + // seen in Cities of Gold + _readLittleEndianAttributeU32x2(shaderContext, src, attributeInputIndex); + } + else if (attrib->format == FMT_32 && attrib->nfa == 1 && attrib->isSigned == 0) + { + // seen in Nano Assault Neo + _readLittleEndianAttributeU32x1(shaderContext, src, attributeInputIndex); + } + else if (attrib->format == FMT_2_10_10_10 && attrib->nfa == 0 && attrib->isSigned == 0) + { + // seen in Fast Racing Neo + _readLittleEndianAttributeU32x1(shaderContext, src, attributeInputIndex); + src->add("attrDecoder.xyzw = uint4((attrDecoder.x>>0)&0x3FF,(attrDecoder.x>>10)&0x3FF,(attrDecoder.x>>20)&0x3FF,(attrDecoder.x>>30)&0x3);" _CRLF); + src->add("attrDecoder.x = as_type<uint>(max(float(int(attrDecoder.x))/1023.0,-1.0));" _CRLF); + src->add("attrDecoder.y =
as_type<uint>(max(float(int(attrDecoder.y))/1023.0,-1.0));" _CRLF); + src->add("attrDecoder.z = as_type<uint>(max(float(int(attrDecoder.z))/1023.0,-1.0));" _CRLF); + src->add("attrDecoder.w = as_type<uint>(float(attrDecoder.w));" _CRLF); // todo - is this correct? + } + else if (attrib->format == FMT_16_16_16_16 && attrib->nfa == 0 && attrib->isSigned != 0) + { + // seen in CoD ghosts + _readLittleEndianAttributeU16x4(shaderContext, src, attributeInputIndex); + src->add("if( (attrDecoder.x&0x8000) != 0 ) attrDecoder.x |= 0xFFFF0000;" _CRLF); + src->add("if( (attrDecoder.y&0x8000) != 0 ) attrDecoder.y |= 0xFFFF0000;" _CRLF); + src->add("if( (attrDecoder.z&0x8000) != 0 ) attrDecoder.z |= 0xFFFF0000;" _CRLF); + src->add("if( (attrDecoder.w&0x8000) != 0 ) attrDecoder.w |= 0xFFFF0000;" _CRLF); + src->add("attrDecoder.x = as_type<uint>(max(float(int(attrDecoder.x))/32767.0,-1.0));" _CRLF); + src->add("attrDecoder.y = as_type<uint>(max(float(int(attrDecoder.y))/32767.0,-1.0));" _CRLF); + src->add("attrDecoder.z = as_type<uint>(max(float(int(attrDecoder.z))/32767.0,-1.0));" _CRLF); + src->add("attrDecoder.w = as_type<uint>(max(float(int(attrDecoder.w))/32767.0,-1.0));" _CRLF); + } + else if( attrib->format == FMT_16_16_16_16 && attrib->nfa == 2 && attrib->isSigned == 1 ) + { + // seen in Rabbids Land + _readLittleEndianAttributeU16x4(shaderContext, src, attributeInputIndex); + src->add("if( (attrDecoder.x&0x8000) != 0 ) attrDecoder.x |= 0xFFFF0000;" _CRLF); + src->add("if( (attrDecoder.y&0x8000) != 0 ) attrDecoder.y |= 0xFFFF0000;" _CRLF); + src->add("if( (attrDecoder.z&0x8000) != 0 ) attrDecoder.z |= 0xFFFF0000;" _CRLF); + src->add("if( (attrDecoder.w&0x8000) != 0 ) attrDecoder.w |= 0xFFFF0000;" _CRLF); + src->add("attrDecoder.xyzw = as_type<uint4>(float4(int4(attrDecoder)));" _CRLF); + } + else if (attrib->format == FMT_16_16_16_16_FLOAT && attrib->nfa == 2) + { + // seen in Giana Sisters: Twisted Dreams + _readLittleEndianAttributeU16x4(shaderContext, src, attributeInputIndex); + // TODO: uint4?
+ src->add("attrDecoder.xyzw = as_type<uint4>(float4(float2(as_type<half2>(attrDecoder.x|(attrDecoder.y<<16))),float2(as_type<half2>(attrDecoder.z|(attrDecoder.w<<16)))));" _CRLF); + } + else if (attrib->format == FMT_16_16 && attrib->nfa == 0 && attrib->isSigned != 0) + { + // seen in Nano Assault Neo + _readLittleEndianAttributeU16x2(shaderContext, src, attributeInputIndex); + src->add("if( (attrDecoder.x&0x8000) != 0 ) attrDecoder.x |= 0xFFFF0000;" _CRLF); + src->add("if( (attrDecoder.y&0x8000) != 0 ) attrDecoder.y |= 0xFFFF0000;" _CRLF); + src->add("attrDecoder.x = as_type<uint>(max(float(int(attrDecoder.x))/32767.0,-1.0));" _CRLF); + src->add("attrDecoder.y = as_type<uint>(max(float(int(attrDecoder.y))/32767.0,-1.0));" _CRLF); + } + else if (attrib->format == FMT_16_16_FLOAT && attrib->nfa == 2) + { + // seen in Giana Sisters: Twisted Dreams + _readLittleEndianAttributeU16x2(shaderContext, src, attributeInputIndex); + src->add("attrDecoder.xy = as_type<uint2>(float2(as_type<half2>(attrDecoder.x|(attrDecoder.y<<16))));" _CRLF); + src->add("attrDecoder.zw = uint2(0);" _CRLF); + } + else if( attrib->format == FMT_8_8_8_8 && attrib->nfa == 0 && attrib->isSigned == 0 ) + { + src->addFmt("attrDecoder.xyzw = as_type<uint4>(float4(in.attrDataSem{}.xyzw)/255.0);" _CRLF, attributeInputIndex); + } + else if( attrib->format == FMT_8_8_8_8 && attrib->nfa == 0 && attrib->isSigned != 0 ) + { + src->addFmt("attrDecoder.xyzw = in.attrDataSem{}.xyzw;" _CRLF, attributeInputIndex); + src->add("if( (attrDecoder.x&0x80) != 0 ) attrDecoder.x |= 0xFFFFFF00;" _CRLF); + src->add("if( (attrDecoder.y&0x80) != 0 ) attrDecoder.y |= 0xFFFFFF00;" _CRLF); + src->add("if( (attrDecoder.z&0x80) != 0 ) attrDecoder.z |= 0xFFFFFF00;" _CRLF); + src->add("if( (attrDecoder.w&0x80) != 0 ) attrDecoder.w |= 0xFFFFFF00;" _CRLF); + src->add("attrDecoder.x = as_type<uint>(max(float(int(attrDecoder.x))/127.0,-1.0));" _CRLF); + src->add("attrDecoder.y = as_type<uint>(max(float(int(attrDecoder.y))/127.0,-1.0));" _CRLF); + src->add("attrDecoder.z =
as_type<uint>(max(float(int(attrDecoder.z))/127.0,-1.0));" _CRLF); + src->add("attrDecoder.w = as_type<uint>(max(float(int(attrDecoder.w))/127.0,-1.0));" _CRLF); + } + else if (attrib->format == FMT_8_8_8_8 && attrib->nfa == 1 && attrib->isSigned == 0) + { + src->addFmt("attrDecoder.xyzw = in.attrDataSem{}.xyzw;" _CRLF, attributeInputIndex); + } + else if (attrib->format == FMT_8_8_8_8 && attrib->nfa == 1 && attrib->isSigned != 0) + { + // seen in Sonic Lost World + src->addFmt("attrDecoder.xyzw = in.attrDataSem{}.xyzw;" _CRLF, attributeInputIndex); + src->add("if( (attrDecoder.x&0x80) != 0 ) attrDecoder.x |= 0xFFFFFF00;" _CRLF); + src->add("if( (attrDecoder.y&0x80) != 0 ) attrDecoder.y |= 0xFFFFFF00;" _CRLF); + src->add("if( (attrDecoder.z&0x80) != 0 ) attrDecoder.z |= 0xFFFFFF00;" _CRLF); + src->add("if( (attrDecoder.w&0x80) != 0 ) attrDecoder.w |= 0xFFFFFF00;" _CRLF); + } + else if( attrib->format == FMT_8_8_8_8 && attrib->nfa == 2 && attrib->isSigned == 0 ) + { + // seen in One Piece + // TODO: uint4?
+ src->addFmt("attrDecoder.xyzw = as_type<uint4>(float4(in.attrDataSem{}.xyzw));" _CRLF, attributeInputIndex); + } + else if (attrib->format == FMT_8_8 && attrib->nfa == 0 && attrib->isSigned == 0) + { + if( (attrib->offset&3) == 2 && LatteGPUState.glVendor == GLVENDOR_AMD && g_renderer->GetType() == RendererAPI::OpenGL ) + { + // AMD workaround + src->addFmt("attrDecoder.xy = as_type<uint2>(float2(in.attrDataSem{}.zw)/255.0);" _CRLF, attributeInputIndex); + src->add("attrDecoder.zw = uint2(0);" _CRLF); + } + else + { + src->addFmt("attrDecoder.xy = as_type<uint2>(float2(in.attrDataSem{}.xy)/255.0);" _CRLF, attributeInputIndex); + src->add("attrDecoder.zw = uint2(0);" _CRLF); + } + } + else if (attrib->format == FMT_8_8 && attrib->nfa == 2 && attrib->isSigned == 0) + { + // seen in BotW + if ((attrib->offset & 3) == 2 && LatteGPUState.glVendor == GLVENDOR_AMD && g_renderer->GetType() == RendererAPI::OpenGL) + { + // AMD workaround + src->addFmt("attrDecoder.xy = as_type<uint2>(float2(in.attrDataSem{}.zw));" _CRLF, attributeInputIndex); + src->add("attrDecoder.zw = uint2(0);" _CRLF); + } + else + { + src->addFmt("attrDecoder.xy = as_type<uint2>(float2(in.attrDataSem{}.xy));" _CRLF, attributeInputIndex); + src->add("attrDecoder.zw = uint2(0);" _CRLF); + } + } + else if (attrib->format == FMT_8_8 && attrib->nfa == 0 && attrib->isSigned != 0) + { + if ((attrib->offset & 3) == 2 && LatteGPUState.glVendor == GLVENDOR_AMD && g_renderer->GetType() == RendererAPI::OpenGL) + { + // AMD workaround + src->addFmt("attrDecoder.xy = in.attrDataSem{}.zw;" _CRLF, attributeInputIndex); + src->add("if( (attrDecoder.x&0x80) != 0 ) attrDecoder.x |= 0xFFFFFF00;" _CRLF); + src->add("if( (attrDecoder.y&0x80) != 0 ) attrDecoder.y |= 0xFFFFFF00;" _CRLF); + src->add("attrDecoder.x = as_type<uint>(max(float(int(attrDecoder.x))/127.0,-1.0));" _CRLF); + src->add("attrDecoder.y = as_type<uint>(max(float(int(attrDecoder.y))/127.0,-1.0));" _CRLF); + src->add("attrDecoder.zw = uint2(0);" _CRLF); + } + else + { + src->addFmt("attrDecoder.xy =
in.attrDataSem{}.xy;" _CRLF, attributeInputIndex); + src->add("if( (attrDecoder.x&0x80) != 0 ) attrDecoder.x |= 0xFFFFFF00;" _CRLF); + src->add("if( (attrDecoder.y&0x80) != 0 ) attrDecoder.y |= 0xFFFFFF00;" _CRLF); + src->add("attrDecoder.x = as_type<uint>(max(float(int(attrDecoder.x))/127.0,-1.0));" _CRLF); + src->add("attrDecoder.y = as_type<uint>(max(float(int(attrDecoder.y))/127.0,-1.0));" _CRLF); + src->add("attrDecoder.zw = uint2(0);" _CRLF); + } + } + else if (attrib->format == FMT_8_8 && attrib->nfa == 1 && attrib->isSigned == 0) + { + if ((attrib->offset & 3) == 2 && LatteGPUState.glVendor == GLVENDOR_AMD && g_renderer->GetType() == RendererAPI::OpenGL) + { + // AMD workaround + src->addFmt("attrDecoder.xyzw = uint4(in.attrDataSem{}.zw,0,0);" _CRLF, attributeInputIndex); + } + else + { + src->addFmt("attrDecoder.xyzw = uint4(in.attrDataSem{}.xy,0,0);" _CRLF, attributeInputIndex); + } + } + else if( attrib->format == FMT_8 && attrib->nfa == 0 && attrib->isSigned == 0 ) + { + // seen in Pikmin 3 + src->addFmt("attrDecoder.x = as_type<uint>(float(in.attrDataSem{}.x)/255.0);" _CRLF, attributeInputIndex); + src->add("attrDecoder.yzw = uint3(0);" _CRLF); + } + else if( attrib->format == FMT_8 && attrib->nfa == 1 && attrib->isSigned == 0 ) + { + src->addFmt("attrDecoder.xyzw = uint4(in.attrDataSem{}.x,0,0,0);" _CRLF, attributeInputIndex); + } + else + { + cemuLog_log(LogType::Force, "_emitAttributeDecode(): Unsupported fmt {:02x} nfa {} signed {} endian {}\n", attrib->format, attrib->nfa, attrib->isSigned, attrib->endianSwap); + cemu_assert_debug(false); + } + } + else if( attrib->endianSwap == LatteConst::VertexFetchEndianMode::SWAP_U16 ) + { + if( attrib->format == FMT_16_16_16_16_FLOAT && attrib->nfa == 2 ) + { + _readBigEndianAttributeU16x4(shaderContext, src, attributeInputIndex); + // TODO: uint4?
+ src->add("attrDecoder.xyzw = as_type<uint4>(float4(float2(as_type<half2>(attrDecoder.x|(attrDecoder.y<<16))),float2(as_type<half2>(attrDecoder.z|(attrDecoder.w<<16)))));" _CRLF); + } + else if (attrib->format == FMT_16_16_16_16 && attrib->nfa == 0 && attrib->isSigned != 0) + { + _readBigEndianAttributeU16x4(shaderContext, src, attributeInputIndex); + src->add("if( (attrDecoder.x&0x8000) != 0 ) attrDecoder.x |= 0xFFFF0000;" _CRLF); + src->add("if( (attrDecoder.y&0x8000) != 0 ) attrDecoder.y |= 0xFFFF0000;" _CRLF); + src->add("if( (attrDecoder.z&0x8000) != 0 ) attrDecoder.z |= 0xFFFF0000;" _CRLF); + src->add("if( (attrDecoder.w&0x8000) != 0 ) attrDecoder.w |= 0xFFFF0000;" _CRLF); + src->add("attrDecoder.x = as_type<uint>(max(float(int(attrDecoder.x))/32767.0,-1.0));" _CRLF); + src->add("attrDecoder.y = as_type<uint>(max(float(int(attrDecoder.y))/32767.0,-1.0));" _CRLF); + src->add("attrDecoder.z = as_type<uint>(max(float(int(attrDecoder.z))/32767.0,-1.0));" _CRLF); + src->add("attrDecoder.w = as_type<uint>(max(float(int(attrDecoder.w))/32767.0,-1.0));" _CRLF); + } + else if (attrib->format == FMT_16_16_16_16 && attrib->nfa == 0 && attrib->isSigned == 0) + { + // seen in BotW + _readBigEndianAttributeU16x4(shaderContext, src, attributeInputIndex); + src->add("attrDecoder.x = as_type<uint>(float(int(attrDecoder.x))/65535.0);" _CRLF); + src->add("attrDecoder.y = as_type<uint>(float(int(attrDecoder.y))/65535.0);" _CRLF); + src->add("attrDecoder.z = as_type<uint>(float(int(attrDecoder.z))/65535.0);" _CRLF); + src->add("attrDecoder.w = as_type<uint>(float(int(attrDecoder.w))/65535.0);" _CRLF); + } + else if( attrib->format == FMT_16_16_16_16 && attrib->nfa == 2 && attrib->isSigned != 0 ) + { + // seen in Minecraft Wii U Edition + _readBigEndianAttributeU16x4(shaderContext, src, attributeInputIndex); + src->add("if( (attrDecoder.x&0x8000) != 0 ) attrDecoder.x |= 0xFFFF0000;" _CRLF); + src->add("if( (attrDecoder.y&0x8000) != 0 ) attrDecoder.y |= 0xFFFF0000;" _CRLF); + src->add("if( (attrDecoder.z&0x8000) != 0 ) attrDecoder.z |= 0xFFFF0000;"
_CRLF); + src->add("if( (attrDecoder.w&0x8000) != 0 ) attrDecoder.w |= 0xFFFF0000;" _CRLF); + src->add("attrDecoder.x = as_type<uint>(float(int(attrDecoder.x)));" _CRLF); + src->add("attrDecoder.y = as_type<uint>(float(int(attrDecoder.y)));" _CRLF); + src->add("attrDecoder.z = as_type<uint>(float(int(attrDecoder.z)));" _CRLF); + src->add("attrDecoder.w = as_type<uint>(float(int(attrDecoder.w)));" _CRLF); + } + else if( attrib->format == FMT_16_16_16_16 && attrib->nfa == 1 && attrib->isSigned != 0 ) + { + // seen in Minecraft Wii U Edition + _readBigEndianAttributeU16x4(shaderContext, src, attributeInputIndex); + src->add("if( (attrDecoder.x&0x8000) != 0 ) attrDecoder.x |= 0xFFFF0000;" _CRLF); + src->add("if( (attrDecoder.y&0x8000) != 0 ) attrDecoder.y |= 0xFFFF0000;" _CRLF); + src->add("if( (attrDecoder.z&0x8000) != 0 ) attrDecoder.z |= 0xFFFF0000;" _CRLF); + src->add("if( (attrDecoder.w&0x8000) != 0 ) attrDecoder.w |= 0xFFFF0000;" _CRLF); + } + else if( attrib->format == FMT_16_16_16_16 && attrib->nfa == 1 && attrib->isSigned == 0 ) + { + _readBigEndianAttributeU16x4(shaderContext, src, attributeInputIndex); + } + else if( attrib->format == FMT_16_16_FLOAT && attrib->nfa == 2 ) + { + _readBigEndianAttributeU16x2(shaderContext, src, attributeInputIndex); + src->add("attrDecoder.xy = as_type<uint2>(float2(as_type<half2>(attrDecoder.x|(attrDecoder.y<<16))));" _CRLF); + src->add("attrDecoder.zw = uint2(0);" _CRLF); + } + else if( attrib->format == FMT_16_16 && attrib->nfa == 0 && attrib->isSigned == 0 ) + { + _readBigEndianAttributeU16x2(shaderContext, src, attributeInputIndex); + src->add("attrDecoder.xy = as_type<uint2>(float2(float(attrDecoder.x), float(attrDecoder.y))/65535.0);" _CRLF); + src->add("attrDecoder.zw = uint2(0);" _CRLF); + } + else if( attrib->format == FMT_16_16 && attrib->nfa == 0 && attrib->isSigned != 0 ) + { + _readBigEndianAttributeU16x2(shaderContext, src, attributeInputIndex); + src->add("if( (attrDecoder.x&0x8000) != 0 ) attrDecoder.x |= 0xFFFF0000;" _CRLF); + src->add("if(
(attrDecoder.y&0x8000) != 0 ) attrDecoder.y |= 0xFFFF0000;" _CRLF); + src->add("attrDecoder.x = as_type<uint>(max(float(int(attrDecoder.x))/32767.0,-1.0));" _CRLF); + src->add("attrDecoder.y = as_type<uint>(max(float(int(attrDecoder.y))/32767.0,-1.0));" _CRLF); + src->add("attrDecoder.zw = uint2(0);" _CRLF); + } + else if( attrib->format == FMT_16_16 && attrib->nfa == 1 && attrib->isSigned == 0 ) + { + _readBigEndianAttributeU16x2(shaderContext, src, attributeInputIndex); + } + else if( attrib->format == FMT_16_16 && attrib->nfa == 1 && attrib->isSigned != 0 ) + { + _readBigEndianAttributeU16x2(shaderContext, src, attributeInputIndex); + src->add("if( (attrDecoder.x&0x8000) != 0 ) attrDecoder.x |= 0xFFFF0000;" _CRLF); + src->add("if( (attrDecoder.y&0x8000) != 0 ) attrDecoder.y |= 0xFFFF0000;" _CRLF); + src->add("attrDecoder.zw = uint2(0);" _CRLF); + } + else if( attrib->format == FMT_16_16 && attrib->nfa == 2 && attrib->isSigned == 0 ) + { + _readBigEndianAttributeU16x2(shaderContext, src, attributeInputIndex); + src->add("attrDecoder.xy = as_type<uint2>(float2(float(attrDecoder.x), float(attrDecoder.y)));" _CRLF); + src->add("attrDecoder.zw = uint2(0);" _CRLF); + } + else if( attrib->format == FMT_16_16 && attrib->nfa == 2 && attrib->isSigned != 0 ) + { + _readBigEndianAttributeU16x2(shaderContext, src, attributeInputIndex); + src->add("if( (attrDecoder.x&0x8000) != 0 ) attrDecoder.x |= 0xFFFF0000;" _CRLF); + src->add("if( (attrDecoder.y&0x8000) != 0 ) attrDecoder.y |= 0xFFFF0000;" _CRLF); + src->add("attrDecoder.xy = as_type<uint2>(float2(float(int(attrDecoder.x)), float(int(attrDecoder.y))));" _CRLF); + src->add("attrDecoder.zw = uint2(0);" _CRLF); + } + else if (attrib->format == FMT_16 && attrib->nfa == 1 && attrib->isSigned == 0) + { + _readBigEndianAttributeU16x1(shaderContext, src, attributeInputIndex); + } + else if (attrib->format == FMT_16 && attrib->nfa == 0 && attrib->isSigned == 0) + { + // seen in CoD ghosts + _readBigEndianAttributeU16x1(shaderContext, src,
attributeInputIndex); + src->add("attrDecoder.x = as_type<uint>(float(int(attrDecoder.x))/65535.0);" _CRLF); + } + else + { + cemuLog_logDebug(LogType::Force, "_emitAttributeDecode(): Unsupported fmt {:02x} nfa {} signed {} endian {}", attrib->format, attrib->nfa, attrib->isSigned, attrib->endianSwap); + } + } + else + { + cemu_assert_debug(false); + } +} diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp new file mode 100644 index 000000000..ab8906718 --- /dev/null +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp @@ -0,0 +1,571 @@ +#pragma once + +#include "Common/precompiled.h" +#include "Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h" +#include "HW/Latte/Core/LatteShader.h" + +namespace LatteDecompiler +{ + static void _emitUniformVariables(LatteDecompilerShaderContext* decompilerContext, bool isRectVertexShader) + { + auto src = decompilerContext->shaderSource; + + auto& uniformOffsets = decompilerContext->output->uniformOffsetsVK; + + src->add("struct SupportBuffer {" _CRLF); + + sint32 uniformCurrentOffset = 0; + auto shader = decompilerContext->shader; + auto shaderType = decompilerContext->shader->shaderType; + if (decompilerContext->shader->uniformMode == LATTE_DECOMPILER_UNIFORM_MODE_REMAPPED) + { + // uniform registers or buffers are accessed statically with predictable offsets + // this allows us to remap the used entries into a more compact array + src->addFmt("int4 remapped[{}];" _CRLF, (sint32)shader->list_remappedUniformEntries.size()); + uniformOffsets.offset_remapped = uniformCurrentOffset; + uniformCurrentOffset += 16 * shader->list_remappedUniformEntries.size(); + } + else if (decompilerContext->shader->uniformMode == LATTE_DECOMPILER_UNIFORM_MODE_FULL_CFILE) + { + uint32 cfileSize = decompilerContext->analyzer.uniformRegisterAccessTracker.DetermineSize(decompilerContext->shaderBaseHash, 256); + // full or
partial uniform register file has to be present + src->addFmt("int4 uniformRegister[{}];" _CRLF, cfileSize); + uniformOffsets.offset_uniformRegister = uniformCurrentOffset; + uniformOffsets.count_uniformRegister = cfileSize; + uniformCurrentOffset += 16 * cfileSize; + } + // special uniforms + bool hasAnyViewportScaleDisabled = + !decompilerContext->contextRegistersNew->PA_CL_VTE_CNTL.get_VPORT_X_SCALE_ENA() || + !decompilerContext->contextRegistersNew->PA_CL_VTE_CNTL.get_VPORT_Y_SCALE_ENA() || + !decompilerContext->contextRegistersNew->PA_CL_VTE_CNTL.get_VPORT_Z_SCALE_ENA(); + + if (decompilerContext->shaderType == LatteConst::ShaderType::Vertex && hasAnyViewportScaleDisabled) + { + // aka GX2 special state 0 + uniformCurrentOffset = (uniformCurrentOffset + 7)&~7; + src->add("float2 windowSpaceToClipSpaceTransform;" _CRLF); + uniformOffsets.offset_windowSpaceToClipSpaceTransform = uniformCurrentOffset; + uniformCurrentOffset += 8; + } + bool alphaTestEnable = decompilerContext->contextRegistersNew->SX_ALPHA_TEST_CONTROL.get_ALPHA_TEST_ENABLE(); + if (decompilerContext->shaderType == LatteConst::ShaderType::Pixel && alphaTestEnable) + { + uniformCurrentOffset = (uniformCurrentOffset + 3)&~3; + src->add("float alphaTestRef;" _CRLF); + uniformOffsets.offset_alphaTestRef = uniformCurrentOffset; + uniformCurrentOffset += 4; + } + if (decompilerContext->analyzer.outputPointSize && decompilerContext->analyzer.writesPointSize == false) + { + if ((decompilerContext->shaderType == LatteConst::ShaderType::Vertex && !decompilerContext->options->usesGeometryShader) || + decompilerContext->shaderType == LatteConst::ShaderType::Geometry) + { + uniformCurrentOffset = (uniformCurrentOffset + 3)&~3; + src->add("float pointSize;" _CRLF); + uniformOffsets.offset_pointSize = uniformCurrentOffset; + uniformCurrentOffset += 4; + } + } + // define fragCoordScale which holds the xy scale for render target resolution vs effective resolution + if (shader->shaderType ==
LatteConst::ShaderType::Pixel) + { + uniformCurrentOffset = (uniformCurrentOffset + 7)&~7; + src->add("float2 fragCoordScale;" _CRLF); + uniformOffsets.offset_fragCoordScale = uniformCurrentOffset; + uniformCurrentOffset += 8; + } + // provide scale factor for every texture that is accessed via texel coordinates (texelFetch) + for (sint32 t = 0; t < LATTE_NUM_MAX_TEX_UNITS; t++) + { + if (decompilerContext->analyzer.texUnitUsesTexelCoordinates.test(t) == false) + continue; + uniformCurrentOffset = (uniformCurrentOffset + 7) & ~7; + src->addFmt("float2 tex{}Scale;" _CRLF, t); + uniformOffsets.offset_texScale[t] = uniformCurrentOffset; + uniformCurrentOffset += 8; + } + // define verticesPerInstance + streamoutBufferBaseX. NOTE(review): && binds tighter than ||, so the Geometry test below is not gated by useSSBOForStreamout; confirm that is intended + if ((shader->shaderType == LatteConst::ShaderType::Vertex && + (decompilerContext->options->usesGeometryShader || isRectVertexShader)) || + (decompilerContext->analyzer.useSSBOForStreamout && + (shader->shaderType == LatteConst::ShaderType::Vertex && !decompilerContext->options->usesGeometryShader) || + (shader->shaderType == LatteConst::ShaderType::Geometry))) + { + src->add("int verticesPerInstance;" _CRLF); + uniformOffsets.offset_verticesPerInstance = uniformCurrentOffset; + uniformCurrentOffset += 4; + for (uint32 i = 0; i < LATTE_NUM_STREAMOUT_BUFFER; i++) + { + if (decompilerContext->output->streamoutBufferWriteMask[i]) + { + src->addFmt("int streamoutBufferBase{};" _CRLF, i); + uniformOffsets.offset_streamoutBufferBase[i] = uniformCurrentOffset; + uniformCurrentOffset += 4; + } + } + } +/* close the SupportBuffer struct and record the running byte offset as the end of the block */ + src->add("};" _CRLF _CRLF); + + uniformOffsets.offset_endOfBlock = uniformCurrentOffset; + } +/* Declares one MSL struct UBuff<i>, holding an array of float4 named d, for every uniform buffer whose access tracker reports an access. This only happens in FULL_CBANK uniform mode; the REMAPPED, FULL_CFILE and NONE modes emit nothing here, and any other mode trips a debug assert. */ + static void _emitUniformBuffers(LatteDecompilerShaderContext* decompilerContext) + { + auto shaderSrc = decompilerContext->shaderSource; + // uniform buffer definition + if (decompilerContext->shader->uniformMode == LATTE_DECOMPILER_UNIFORM_MODE_FULL_CBANK) + { + for (uint32 i = 0; i < LATTE_NUM_MAX_UNIFORM_BUFFERS; i++) + { + if
(!decompilerContext->analyzer.uniformBufferAccessTracker[i].HasAccess()) + continue; + + cemu_assert_debug(decompilerContext->output->resourceMappingMTL.uniformBuffersBindingPoint[i] >= 0); + + shaderSrc->addFmt("struct UBuff{} {{" _CRLF, i); + shaderSrc->addFmt("float4 d[{}];" _CRLF, decompilerContext->analyzer.uniformBufferAccessTracker[i].DetermineSize(decompilerContext->shaderBaseHash, LATTE_GLSL_DYNAMIC_UNIFORM_BLOCK_SIZE)); + shaderSrc->add("};" _CRLF _CRLF); + } + } + else if (decompilerContext->shader->uniformMode == LATTE_DECOMPILER_UNIFORM_MODE_REMAPPED) + { + // already generated in _emitUniformVariables + } + else if (decompilerContext->shader->uniformMode == LATTE_DECOMPILER_UNIFORM_MODE_FULL_CFILE) + { + // already generated in _emitUniformVariables + } + else if (decompilerContext->shader->uniformMode == LATTE_DECOMPILER_UNIFORM_MODE_NONE) + { + // no uniforms used + } + else + { + cemu_assert_debug(false); + } + } +/* For vertex shaders, declares struct VertexIn with one uint4 attrDataSem<i> member per input semantic set in the analyzer mask. Without fetchVertexManually each member is tagged [[attribute(n)]] using the mapped attribute index; with it the member is left untagged and a "#define ATTRIBUTE_NAME<n> attrDataSem<i>" line is collected and appended after the struct. For other shader types nothing is emitted besides an empty string. */ + static void _emitAttributes(LatteDecompilerShaderContext* decompilerContext, bool fetchVertexManually) + { + auto src = decompilerContext->shaderSource; + std::string attributeNames; + + if (decompilerContext->shader->shaderType == LatteConst::ShaderType::Vertex) + { + src->add("struct VertexIn {" _CRLF); + // attribute inputs + for (uint32 i = 0; i < LATTE_NUM_MAX_ATTRIBUTE_LOCATIONS; i++) + { + if (decompilerContext->analyzer.inputAttributSemanticMask[i]) + { + cemu_assert_debug(decompilerContext->output->resourceMappingMTL.attributeMapping[i] >= 0); + + src->addFmt("uint4 attrDataSem{}", i); + if (fetchVertexManually) + attributeNames += "#define ATTRIBUTE_NAME" + std::to_string((sint32)decompilerContext->output->resourceMappingMTL.attributeMapping[i]) + " attrDataSem" + std::to_string(i) + "\n"; + else + src->addFmt(" [[attribute({})]]", (sint32)decompilerContext->output->resourceMappingMTL.attributeMapping[i]); + src->add(";" _CRLF); + } + } + src->add("};" _CRLF _CRLF); + } + src->addFmt("{}", attributeNames); + } + + static void
_emitVSOutputs(LatteDecompilerShaderContext* shaderContext, bool isRectVertexShader) + { + auto* src = shaderContext->shaderSource; + + src->add("struct VertexOut {" _CRLF); + src->add("float4 position [[position]] [[invariant]];" _CRLF); + if (shaderContext->analyzer.outputPointSize) + src->add("float pointSize [[point_size]];" _CRLF); + + LatteShaderPSInputTable* psInputTable = LatteSHRC_GetPSInputTable(); + auto parameterMask = shaderContext->shader->outputParameterMask; + bool psInputsWritten[GPU7_PS_MAX_INPUTS] = {false}; + for (uint32 i = 0; i < 32; i++) + { + if ((parameterMask&(1 << i)) == 0) + continue; + uint32 vsSemanticId = _getVertexShaderOutParamSemanticId(shaderContext->contextRegisters, i); + if (vsSemanticId > LATTE_ANALYZER_IMPORT_INDEX_PARAM_MAX) + continue; + // get import based on semanticId + sint32 psInputIndex = -1; + for (sint32 f = 0; f < psInputTable->count; f++) + { + if (psInputTable->import[f].semanticId == vsSemanticId) + { + psInputIndex = f; + break; + } + } + if (psInputIndex == -1) + continue; // no ps input + + psInputsWritten[psInputIndex] = true; + + src->addFmt("float4 passParameterSem{}", psInputTable->import[psInputIndex].semanticId); + if (!isRectVertexShader) + { + src->addFmt(" [[user(locn{})]]", psInputIndex); + if (psInputTable->import[psInputIndex].isFlat) + src->add(" [[flat]]"); + if (psInputTable->import[psInputIndex].isNoPerspective) + src->add(" [[center_no_perspective]]"); + } + src->addFmt(";" _CRLF); + } + + // TODO: handle this in the fragment shader instead? 
+ // Declare all PS inputs that are not written by the VS + for (uint32 i = 0; i < psInputTable->count; i++) + { + if (psInputsWritten[i]) + continue; + + if (psInputTable->import[i].semanticId > LATTE_ANALYZER_IMPORT_INDEX_PARAM_MAX) + continue; + + src->addFmt("float4 unknown{} [[user(locn{})]];" _CRLF, psInputTable->import[i].semanticId, i); + } + + src->add("};" _CRLF _CRLF); + + if (isRectVertexShader) + { + src->add("struct ObjectPayload {" _CRLF); + src->add("VertexOut vertexOut[VERTICES_PER_VERTEX_PRIMITIVE];" _CRLF); + src->add("};" _CRLF _CRLF); + } + } + + static void _emitPSInputs(LatteDecompilerShaderContext* shaderContext) + { + auto* src = shaderContext->shaderSource; + + src->add("#define GET_FRAGCOORD() float4(in.position.xy * supportBuffer.fragCoordScale.xy, in.position.z, 1.0 / in.position.w)" _CRLF); + + src->add("struct FragmentIn {" _CRLF); + src->add("float4 position [[position]];" _CRLF); + + LatteShaderPSInputTable* psInputTable = LatteSHRC_GetPSInputTable(); + for (sint32 i = 0; i < psInputTable->count; i++) + { + if (psInputTable->import[i].semanticId > LATTE_ANALYZER_IMPORT_INDEX_PARAM_MAX) + continue; + src->addFmt("float4 passParameterSem{}", psInputTable->import[i].semanticId); + src->addFmt(" [[user(locn{})]]", i); + if (psInputTable->import[i].isFlat) + src->add(" [[flat]]"); + if (psInputTable->import[i].isNoPerspective) + src->add(" [[center_no_perspective]]"); + src->add(";" _CRLF); + } + + src->add("};" _CRLF _CRLF); + } + + static void _emitInputsAndOutputs(LatteDecompilerShaderContext* decompilerContext, bool isRectVertexShader, bool fetchVertexManually, bool rasterizationEnabled) + { + auto src = decompilerContext->shaderSource; + + if (decompilerContext->shaderType == LatteConst::ShaderType::Vertex) + { + _emitAttributes(decompilerContext, fetchVertexManually); + } + else if (decompilerContext->shaderType == LatteConst::ShaderType::Pixel) + { + _emitPSInputs(decompilerContext); + + src->add("struct FragmentOut {" _CRLF); + + 
// generate pixel outputs for pixel shader + for (uint32 i = 0; i < LATTE_NUM_COLOR_TARGET; i++) + { + if ((decompilerContext->shader->pixelColorOutputMask & (1 << i)) != 0) + { + auto dataType = GetColorBufferDataType(i, *decompilerContext->contextRegistersNew); + if (dataType != MetalDataType::NONE) + { + src->addFmt("{} passPixelColor{} [[color({})]];" _CRLF, GetDataTypeStr(dataType), i, i); + } + } + } + + // generate depth output for pixel shader + if (decompilerContext->shader->depthMask) + src->add("float passDepth [[depth(any)]];" _CRLF); + + src->add("};" _CRLF _CRLF); + } + + if (!decompilerContext->options->usesGeometryShader) + { + if (decompilerContext->shaderType == LatteConst::ShaderType::Vertex && rasterizationEnabled) + _emitVSOutputs(decompilerContext, isRectVertexShader); + } + else + { + if (decompilerContext->shaderType == LatteConst::ShaderType::Vertex || decompilerContext->shaderType == LatteConst::ShaderType::Geometry) + { + src->add("struct VertexOut {" _CRLF); + uint32 ringParameterCountVS2GS = 0; + if (decompilerContext->shaderType == LatteConst::ShaderType::Vertex) + { + ringParameterCountVS2GS = decompilerContext->shader->ringParameterCount; + } + else + { + ringParameterCountVS2GS = decompilerContext->shader->ringParameterCountFromPrevStage; + } + for (uint32 f = 0; f < ringParameterCountVS2GS; f++) + src->addFmt("int4 passParameterSem{};" _CRLF, f); + src->add("};" _CRLF _CRLF); + src->add("struct ObjectPayload {" _CRLF); + src->add("VertexOut vertexOut[VERTICES_PER_VERTEX_PRIMITIVE];" _CRLF); + src->add("};" _CRLF _CRLF); + } + if (decompilerContext->shaderType == LatteConst::ShaderType::Geometry) + { + // parameters shared between geometry and pixel shader + uint32 ringItemSize = decompilerContext->contextRegisters[mmSQ_GSVS_RING_ITEMSIZE] & 0x7FFF; + if ((ringItemSize & 0xF) != 0) + debugBreakpoint(); + if (((decompilerContext->contextRegisters[mmSQ_GSVS_RING_ITEMSIZE] & 0x7FFF) & 0xF) != 0) + debugBreakpoint(); + + 
src->add("struct GeometryOut {" _CRLF); + src->add("float4 position [[position]];" _CRLF); + for (sint32 p = 0; p < decompilerContext->parsedGSCopyShader->numParam; p++) + { + if (decompilerContext->parsedGSCopyShader->paramMapping[p].exportType != 2) + continue; + src->addFmt("float4 passParameterSem{} [[user(locn{})]];" _CRLF, (sint32)decompilerContext->parsedGSCopyShader->paramMapping[p].exportParam, decompilerContext->parsedGSCopyShader->paramMapping[p].exportParam & 0x7F); + } + src->add("};" _CRLF _CRLF); + + const uint32 MAX_VERTEX_COUNT = 32; + + // Define the mesh shader output type + src->addFmt("using MeshType = mesh;" _CRLF, MAX_VERTEX_COUNT, MAX_VERTEX_COUNT); + } + } + } + + static void emitHeader(LatteDecompilerShaderContext* decompilerContext, bool isRectVertexShader, bool fetchVertexManually, bool rasterizationEnabled) + { + auto src = decompilerContext->shaderSource; + + if ((decompilerContext->options->usesGeometryShader || isRectVertexShader) && (decompilerContext->shaderType == LatteConst::ShaderType::Vertex || decompilerContext->shaderType == LatteConst::ShaderType::Geometry)) + { + LattePrimitiveMode vsOutPrimType = static_cast(decompilerContext->contextRegisters[mmVGT_PRIMITIVE_TYPE]); + uint32 gsOutPrimType = decompilerContext->contextRegisters[mmVGT_GS_OUT_PRIM_TYPE]; + + switch (vsOutPrimType) + { + case LattePrimitiveMode::POINTS: + src->add("#define VERTICES_PER_VERTEX_PRIMITIVE 1" _CRLF); + break; + case LattePrimitiveMode::LINES: + src->add("#define VERTICES_PER_VERTEX_PRIMITIVE 2" _CRLF); + break; + case LattePrimitiveMode::TRIANGLES: + src->add("#define VERTICES_PER_VERTEX_PRIMITIVE 3" _CRLF); + break; + case LattePrimitiveMode::RECTS: + src->add("#define VERTICES_PER_VERTEX_PRIMITIVE 3" _CRLF); + break; + default: + cemuLog_log(LogType::Force, "Unknown vertex out primitive type {}", vsOutPrimType); + break; + } + if (decompilerContext->shaderType == LatteConst::ShaderType::Geometry) + { + switch (gsOutPrimType) + { + case 0: // 
Point + src->add("#define MTL_PRIMITIVE_TYPE point" _CRLF); + src->add("#define GET_PRIMITIVE_COUNT(vertexCount) (vertexCount / 1)" _CRLF); + break; + case 1: // Line strip + src->add("#define MTL_PRIMITIVE_TYPE line" _CRLF); + src->add("#define GET_PRIMITIVE_COUNT(vertexCount) (vertexCount - 1)" _CRLF); + break; + case 2: // Triangle strip + src->add("#define MTL_PRIMITIVE_TYPE triangle" _CRLF); + src->add("#define GET_PRIMITIVE_COUNT(vertexCount) (vertexCount - 2)" _CRLF); + break; + default: + cemuLog_log(LogType::Force, "Unknown geometry out primitive type {}", gsOutPrimType); + break; + } + } + } + + if (decompilerContext->contextRegistersNew->PA_CL_CLIP_CNTL.get_DX_CLIP_SPACE_DEF()) + src->add("#define SET_POSITION(_v) out.position = _v" _CRLF); + else + src->add("#define SET_POSITION(_v) out.position = _v; out.position.z = (out.position.z + out.position.w) / 2.0" _CRLF); + + const bool dump_shaders_enabled = ActiveSettings::DumpShadersEnabled(); + if(dump_shaders_enabled) + decompilerContext->shaderSource->add("// start of shader inputs/outputs, predetermined by Cemu. 
Do not touch" _CRLF); + // uniform variables + _emitUniformVariables(decompilerContext, isRectVertexShader); + // uniform buffers + _emitUniformBuffers(decompilerContext); + // inputs and outputs + _emitInputsAndOutputs(decompilerContext, isRectVertexShader, fetchVertexManually, rasterizationEnabled); + + if (dump_shaders_enabled) + decompilerContext->shaderSource->add("// end of shader inputs/outputs" _CRLF); + } + + static void _emitUniformBufferDefinitions(LatteDecompilerShaderContext* decompilerContext) + { + auto src = decompilerContext->shaderSource; + // uniform buffer definition + if (decompilerContext->shader->uniformMode == LATTE_DECOMPILER_UNIFORM_MODE_FULL_CBANK) + { + for (uint32 i = 0; i < LATTE_NUM_MAX_UNIFORM_BUFFERS; i++) + { + if (!decompilerContext->analyzer.uniformBufferAccessTracker[i].HasAccess()) + continue; + + cemu_assert_debug(decompilerContext->output->resourceMappingMTL.uniformBuffersBindingPoint[i] >= 0); + + src->addFmt(", constant UBuff{}& ubuff{} [[buffer({})]]", i, i, (sint32)decompilerContext->output->resourceMappingMTL.uniformBuffersBindingPoint[i]); + } + } + } + + static void _emitTextureDefinitions(LatteDecompilerShaderContext* shaderContext) + { + bool renderTargetIndexUsed[LATTE_NUM_COLOR_TARGET] = {false}; + + auto src = shaderContext->shaderSource; + // texture sampler definition + for (sint32 i = 0; i < LATTE_NUM_MAX_TEX_UNITS; i++) + { + if (!shaderContext->output->textureUnitMask[i]) + continue; + + uint8 renderTargetIndex = shaderContext->shader->textureRenderTargetIndex[i]; + if (static_cast(g_renderer.get())->SupportsFramebufferFetch() && renderTargetIndex != 255) + { + if (!renderTargetIndexUsed[renderTargetIndex]) + { + src->addFmt(", {} col{} [[color({})]]", GetDataTypeStr(GetColorBufferDataType(renderTargetIndex, *shaderContext->contextRegistersNew)), renderTargetIndex, renderTargetIndex); + renderTargetIndexUsed[renderTargetIndex] = true; + } + } + else + { + src->add(", "); + + // Only certain texture dimensions 
can be used with comparison samplers + if (shaderContext->shader->textureUsesDepthCompare[i] && IsValidDepthTextureType(shaderContext->shader->textureUnitDim[i])) + src->add("depth"); + else + src->add("texture"); + + if (shaderContext->shader->textureIsIntegerFormat[i]) + { + // integer samplers + if (shaderContext->shader->textureUnitDim[i] == Latte::E_DIM::DIM_1D) + src->add("1d"); + else if (shaderContext->shader->textureUnitDim[i] == Latte::E_DIM::DIM_2D || shaderContext->shader->textureUnitDim[i] == Latte::E_DIM::DIM_2D_MSAA) + src->add("2d"); + else + cemu_assert_unimplemented(); + } + else if (shaderContext->shader->textureUnitDim[i] == Latte::E_DIM::DIM_2D || shaderContext->shader->textureUnitDim[i] == Latte::E_DIM::DIM_2D_MSAA) + src->add("2d"); + else if (shaderContext->shader->textureUnitDim[i] == Latte::E_DIM::DIM_1D) + src->add("1d"); + else if (shaderContext->shader->textureUnitDim[i] == Latte::E_DIM::DIM_2D_ARRAY) + src->add("2d_array"); + else if (shaderContext->shader->textureUnitDim[i] == Latte::E_DIM::DIM_CUBEMAP) + src->add("cube_array"); + else if (shaderContext->shader->textureUnitDim[i] == Latte::E_DIM::DIM_3D) + src->add("3d"); + else + { + cemu_assert_unimplemented(); + } + + uint32 binding = shaderContext->output->resourceMappingMTL.textureUnitToBindingPoint[i]; + //uint32 textureBinding = shaderContext->output->resourceMappingMTL.textureUnitToBindingPoint[i] % 31; + //uint32 samplerBinding = textureBinding % 16; + src->addFmt(" tex{} [[texture({})]]", i, binding); + src->addFmt(", sampler samplr{} [[sampler({})]]", i, binding); + } + } + } + + static void emitInputs(LatteDecompilerShaderContext* decompilerContext, bool isRectVertexShader, bool fetchVertexManually) + { + auto src = decompilerContext->shaderSource; + + switch (decompilerContext->shaderType) + { + case LatteConst::ShaderType::Vertex: + if (decompilerContext->options->usesGeometryShader || isRectVertexShader) + { + src->add("object_data ObjectPayload& objectPayload 
[[payload]]"); + src->add(", mesh_grid_properties meshGridProperties"); + src->add(", uint tig [[threadgroup_position_in_grid]]"); + src->add(", uint tid [[thread_index_in_threadgroup]]"); + // TODO: only include index buffer if needed + src->addFmt(", device uint* indexBuffer [[buffer({})]]", decompilerContext->output->resourceMappingMTL.indexBufferBinding); + // TODO: put into the support buffer? + src->addFmt(", constant uchar& indexType [[buffer({})]]", decompilerContext->output->resourceMappingMTL.indexTypeBinding); + } + else + { + // TODO: only include these if needed? + src->add("uint vid [[vertex_id]]"); + src->add(", uint iid [[instance_id]]"); + } + + if (fetchVertexManually) + src->add(" VERTEX_BUFFER_DEFINITIONS"); + else + src->add(", VertexIn in [[stage_in]]"); + + break; + case LatteConst::ShaderType::Geometry: + src->add("MeshType mesh"); + src->add(", const object_data ObjectPayload& objectPayload [[payload]]"); + break; + case LatteConst::ShaderType::Pixel: + src->add("FragmentIn in [[stage_in]]"); + // TODO: only include these if needed? 
+ src->add(", float2 pointCoord [[point_coord]]"); + src->add(", bool frontFacing [[front_facing]]"); + break; + default: + break; + } + + if (decompilerContext->output->resourceMappingMTL.uniformVarsBufferBindingPoint >= 0) + src->addFmt(", constant SupportBuffer& supportBuffer [[buffer({})]]", decompilerContext->output->resourceMappingMTL.uniformVarsBufferBindingPoint); + + // streamout buffer (transform feedback) + if ((decompilerContext->shaderType == LatteConst::ShaderType::Vertex && !decompilerContext->options->usesGeometryShader) || decompilerContext->shaderType == LatteConst::ShaderType::Geometry) + { + if (decompilerContext->analyzer.hasStreamoutEnable && decompilerContext->analyzer.hasStreamoutWrite) + src->addFmt(", device int* sb [[buffer({})]]" _CRLF, decompilerContext->output->resourceMappingMTL.tfStorageBindingPoint); + } + + // uniform buffers + _emitUniformBufferDefinitions(decompilerContext); + // textures + _emitTextureDefinitions(decompilerContext); + } +} diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerInternal.h b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerInternal.h index ed1858bae..f4135640f 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerInternal.h +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerInternal.h @@ -47,7 +47,7 @@ struct LatteDecompilerTEXInstruction sint32 dstGpr; sint8 dstSel[4]; // texture fetch - struct + struct { sint32 textureIndex{}; sint32 samplerIndex{}; @@ -216,7 +216,7 @@ struct LatteDecompilerShaderContext bool genIntReg; // if set, generate R*i register variables bool useArrayGPRs; // if set, an array is used to represent GPRs instead of individual variables }typeTracker; - // analyzer + // analyzer struct { // general @@ -260,6 +260,8 @@ struct LatteDecompilerShaderContext // emitter bool hasUniformVarBlock; sint32 currentBindingPointVK{}; + sint32 currentBufferBindingPointMTL{}; + sint32 currentTextureBindingPointMTL{}; struct 
ALUClauseTemporariesState* aluPVPSState{nullptr}; // misc std::vector list_subroutines; @@ -268,9 +270,10 @@ struct LatteDecompilerShaderContext void LatteDecompiler_analyze(LatteDecompilerShaderContext* shaderContext, LatteDecompilerShader* shader); void LatteDecompiler_analyzeDataTypes(LatteDecompilerShaderContext* shaderContext); void LatteDecompiler_emitGLSLShader(LatteDecompilerShaderContext* shaderContext, LatteDecompilerShader* shader); +void LatteDecompiler_emitMSLShader(LatteDecompilerShaderContext* shaderContext, LatteDecompilerShader* shader); void LatteDecompiler_cleanup(LatteDecompilerShaderContext* shaderContext); // helper functions -sint32 LatteDecompiler_getColorOutputIndexFromExportIndex(LatteDecompilerShaderContext* shaderContext, sint32 exportIndex); \ No newline at end of file +sint32 LatteDecompiler_getColorOutputIndexFromExportIndex(LatteDecompilerShaderContext* shaderContext, sint32 exportIndex); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/CachedFBOMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/CachedFBOMtl.cpp new file mode 100644 index 000000000..a7e87c794 --- /dev/null +++ b/src/Cafe/HW/Latte/Renderer/Metal/CachedFBOMtl.cpp @@ -0,0 +1,64 @@ +#include "Cafe/HW/Latte/Renderer/Metal/CachedFBOMtl.h" +#include "Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.h" +#include "Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h" +#include "Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h" + +CachedFBOMtl::CachedFBOMtl(class MetalRenderer* metalRenderer, uint64 key) : LatteCachedFBO(key) +{ + m_renderPassDescriptor = MTL::RenderPassDescriptor::alloc()->init(); + + bool hasAttachment = false; + for (int i = 0; i < 8; ++i) + { + const auto& buffer = colorBuffer[i]; + auto textureView = (LatteTextureViewMtl*)buffer.texture; + if (!textureView) + { + continue; + } + auto colorAttachment = m_renderPassDescriptor->colorAttachments()->object(i); + colorAttachment->setTexture(textureView->GetRGBAView()); + colorAttachment->setLoadAction(MTL::LoadActionLoad); + 
colorAttachment->setStoreAction(MTL::StoreActionStore); + + hasAttachment = true; + } + + // setup depth attachment + if (depthBuffer.texture) + { + auto textureView = static_cast(depthBuffer.texture); + auto depthAttachment = m_renderPassDescriptor->depthAttachment(); + depthAttachment->setTexture(textureView->GetRGBAView()); + depthAttachment->setLoadAction(MTL::LoadActionLoad); + depthAttachment->setStoreAction(MTL::StoreActionStore); + + // setup stencil attachment + if (depthBuffer.hasStencil && GetMtlPixelFormatInfo(depthBuffer.texture->format, true).hasStencil) + { + auto stencilAttachment = m_renderPassDescriptor->stencilAttachment(); + stencilAttachment->setTexture(textureView->GetRGBAView()); + stencilAttachment->setLoadAction(MTL::LoadActionLoad); + stencilAttachment->setStoreAction(MTL::StoreActionStore); + } + + hasAttachment = true; + } + + // HACK: setup a dummy color attachment to prevent Metal from discarding draws for stremout draws in Super Smash Bros. for Wii U (works fine on MoltenVK without this hack though) + if (!hasAttachment) + { + auto colorAttachment = m_renderPassDescriptor->colorAttachments()->object(0); + colorAttachment->setTexture(metalRenderer->GetNullTexture2D()); + colorAttachment->setLoadAction(MTL::LoadActionDontCare); + colorAttachment->setStoreAction(MTL::StoreActionDontCare); + } + + // Visibility buffer + m_renderPassDescriptor->setVisibilityResultBuffer(metalRenderer->GetOcclusionQueryResultBuffer()); +} + +CachedFBOMtl::~CachedFBOMtl() +{ + m_renderPassDescriptor->release(); +} diff --git a/src/Cafe/HW/Latte/Renderer/Metal/CachedFBOMtl.h b/src/Cafe/HW/Latte/Renderer/Metal/CachedFBOMtl.h new file mode 100644 index 000000000..f1221eb22 --- /dev/null +++ b/src/Cafe/HW/Latte/Renderer/Metal/CachedFBOMtl.h @@ -0,0 +1,22 @@ +#pragma once + +#include + +#include "Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompiler.h" +#include "Cafe/HW/Latte/Core/LatteCachedFBO.h" + +class CachedFBOMtl : public LatteCachedFBO +{ +public: + 
CachedFBOMtl(class MetalRenderer* metalRenderer, uint64 key); + + ~CachedFBOMtl(); + + MTL::RenderPassDescriptor* GetRenderPassDescriptor() + { + return m_renderPassDescriptor; + } + +private: + MTL::RenderPassDescriptor* m_renderPassDescriptor = nullptr; +}; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp new file mode 100644 index 000000000..3c0005efc --- /dev/null +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp @@ -0,0 +1,108 @@ +#include "Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.h" +#include "Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.h" +#include "Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h" +#include "Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h" + +LatteTextureMtl::LatteTextureMtl(class MetalRenderer* mtlRenderer, Latte::E_DIM dim, MPTR physAddress, MPTR physMipAddress, Latte::E_GX2SURFFMT format, uint32 width, uint32 height, uint32 depth, uint32 pitch, uint32 mipLevels, uint32 swizzle, + Latte::E_HWTILEMODE tileMode, bool isDepth) + : LatteTexture(dim, physAddress, physMipAddress, format, width, height, depth, pitch, mipLevels, swizzle, tileMode, isDepth), m_mtlr(mtlRenderer) +{ + MTL::TextureDescriptor* desc = MTL::TextureDescriptor::alloc()->init(); + desc->setStorageMode(MTL::StorageModePrivate); + //desc->setCpuCacheMode(MTL::CPUCacheModeWriteCombined); + + sint32 effectiveBaseWidth = width; + sint32 effectiveBaseHeight = height; + sint32 effectiveBaseDepth = depth; + if (overwriteInfo.hasResolutionOverwrite) + { + effectiveBaseWidth = overwriteInfo.width; + effectiveBaseHeight = overwriteInfo.height; + effectiveBaseDepth = overwriteInfo.depth; + } + effectiveBaseWidth = std::max(1, effectiveBaseWidth); + effectiveBaseHeight = std::max(1, effectiveBaseHeight); + effectiveBaseDepth = std::max(1, effectiveBaseDepth); + + MTL::TextureType textureType; + switch (dim) + { + case Latte::E_DIM::DIM_1D: + textureType = MTL::TextureType1D; + effectiveBaseHeight = 1; + 
break; + case Latte::E_DIM::DIM_2D: + case Latte::E_DIM::DIM_2D_MSAA: + textureType = MTL::TextureType2D; + break; + case Latte::E_DIM::DIM_2D_ARRAY: + textureType = MTL::TextureType2DArray; + break; + case Latte::E_DIM::DIM_3D: + textureType = MTL::TextureType3D; + break; + case Latte::E_DIM::DIM_CUBEMAP: + cemu_assert_debug(effectiveBaseDepth % 6 == 0 && "cubemaps must have an array length multiple of 6"); + + textureType = MTL::TextureTypeCubeArray; + break; + default: + cemu_assert_unimplemented(); + textureType = MTL::TextureType2D; + break; + } + desc->setTextureType(textureType); + + // Clamp mip levels + mipLevels = std::min(mipLevels, (uint32)maxPossibleMipLevels); + mipLevels = std::max(mipLevels, (uint32)1); + + desc->setWidth(effectiveBaseWidth); + desc->setHeight(effectiveBaseHeight); + desc->setMipmapLevelCount(mipLevels); + + if (textureType == MTL::TextureType3D) + { + desc->setDepth(effectiveBaseDepth); + } + else if (textureType == MTL::TextureTypeCubeArray) + { + desc->setArrayLength(effectiveBaseDepth / 6); + } + else if (textureType == MTL::TextureType2DArray) + { + desc->setArrayLength(effectiveBaseDepth); + } + + auto pixelFormat = GetMtlPixelFormat(format, isDepth); + desc->setPixelFormat(pixelFormat); + + MTL::TextureUsage usage = MTL::TextureUsageShaderRead | MTL::TextureUsagePixelFormatView; + if (FormatIsRenderable(format)) + usage |= MTL::TextureUsageRenderTarget; + desc->setUsage(usage); + + m_texture = mtlRenderer->GetDevice()->newTexture(desc); + desc->release(); +} + +LatteTextureMtl::~LatteTextureMtl() +{ + m_texture->release(); +} + +LatteTextureView* LatteTextureMtl::CreateView(Latte::E_DIM dim, Latte::E_GX2SURFFMT format, sint32 firstMip, sint32 mipCount, sint32 firstSlice, sint32 sliceCount) +{ + cemu_assert_debug(mipCount > 0); + cemu_assert_debug(sliceCount > 0); + cemu_assert_debug((firstMip + mipCount) <= this->mipLevels); + cemu_assert_debug((firstSlice + sliceCount) <= this->depth); + + return new 
LatteTextureViewMtl(m_mtlr, this, dim, format, firstMip, mipCount, firstSlice, sliceCount); +} + +// TODO: lazy allocation? +void LatteTextureMtl::AllocateOnHost() +{ + // The texture is already allocated +} diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.h b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.h new file mode 100644 index 000000000..884a5c5b0 --- /dev/null +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.h @@ -0,0 +1,29 @@ +#pragma once + +#include + +#include "Cafe/HW/Latte/Core/LatteTexture.h" +#include "HW/Latte/ISA/LatteReg.h" +#include "util/ChunkedHeap/ChunkedHeap.h" + +class LatteTextureMtl : public LatteTexture +{ +public: + LatteTextureMtl(class MetalRenderer* mtlRenderer, Latte::E_DIM dim, MPTR physAddress, MPTR physMipAddress, Latte::E_GX2SURFFMT format, uint32 width, uint32 height, uint32 depth, uint32 pitch, uint32 mipLevels, + uint32 swizzle, Latte::E_HWTILEMODE tileMode, bool isDepth); + ~LatteTextureMtl(); + + MTL::Texture* GetTexture() const { + return m_texture; + } + + void AllocateOnHost() override; + +protected: + LatteTextureView* CreateView(Latte::E_DIM dim, Latte::E_GX2SURFFMT format, sint32 firstMip, sint32 mipCount, sint32 firstSlice, sint32 sliceCount) override; + +private: + class MetalRenderer* m_mtlr; + + MTL::Texture* m_texture; +}; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureReadbackMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureReadbackMtl.cpp new file mode 100644 index 000000000..405c49df4 --- /dev/null +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureReadbackMtl.cpp @@ -0,0 +1,52 @@ +#include "Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h" +#include "Cafe/HW/Latte/Renderer/Metal/LatteTextureReadbackMtl.h" +#include "Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.h" +#include "Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h" + +LatteTextureReadbackInfoMtl::~LatteTextureReadbackInfoMtl() +{ + if (m_commandBuffer) + m_commandBuffer->release(); +} + +void 
LatteTextureReadbackInfoMtl::StartTransfer()
+{
+    cemu_assert(m_textureView);
+
+    auto* baseTexture = (LatteTextureMtl*)m_textureView->baseTexture;
+
+    cemu_assert_debug(m_textureView->firstSlice == 0);
+    cemu_assert_debug(m_textureView->firstMip == 0);
+    cemu_assert_debug(m_textureView->baseTexture->dim != Latte::E_DIM::DIM_3D);
+
+    size_t bytesPerRow = GetMtlTextureBytesPerRow(baseTexture->format, baseTexture->isDepth, baseTexture->width);
+    size_t bytesPerImage = GetMtlTextureBytesPerImage(baseTexture->format, baseTexture->isDepth, baseTexture->height, bytesPerRow);
+
+    auto blitCommandEncoder = m_mtlr->GetBlitCommandEncoder();
+
+    blitCommandEncoder->copyFromTexture(baseTexture->GetTexture(), 0, 0, MTL::Origin{0, 0, 0}, MTL::Size{(uint32)baseTexture->width, (uint32)baseTexture->height, 1}, m_mtlr->GetTextureReadbackBuffer(), m_bufferOffset, bytesPerRow, bytesPerImage);
+
+    m_commandBuffer = m_mtlr->GetCurrentCommandBuffer()->retain();
+    // TODO: uncomment?
+    //m_mtlr->RequestSoonCommit();
+    m_mtlr->CommitCommandBuffer();
+}
+
+bool LatteTextureReadbackInfoMtl::IsFinished()
+{
+    // Command buffer wasn't even committed, let's commit immediately
+    //if (m_mtlr->GetCurrentCommandBuffer() == m_commandBuffer)
+    //    m_mtlr->CommitCommandBuffer();
+
+    return CommandBufferCompleted(m_commandBuffer);
+}
+
+void LatteTextureReadbackInfoMtl::ForceFinish()
+{
+    m_commandBuffer->waitUntilCompleted();
+}
+
+uint8* LatteTextureReadbackInfoMtl::GetData()
+{
+    return (uint8*)m_mtlr->GetTextureReadbackBuffer()->contents() + m_bufferOffset;
+}
diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureReadbackMtl.h b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureReadbackMtl.h
new file mode 100644
index 000000000..19ca6574a
--- /dev/null
+++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureReadbackMtl.h
@@ -0,0 +1,27 @@
+#pragma once
+
+#include "Cafe/HW/Latte/Renderer/Metal/MetalCommon.h"
+#include "Cafe/HW/Latte/Core/LatteTextureReadbackInfo.h"
+
+// Readback of a texture view into the renderer's texture readback buffer.
+// The data of this transfer lives at m_bufferOffset inside that buffer.
+class LatteTextureReadbackInfoMtl : public LatteTextureReadbackInfo
+{
+public:
+    LatteTextureReadbackInfoMtl(class MetalRenderer* mtlRenderer, LatteTextureView* textureView, uint32 bufferOffset) : LatteTextureReadbackInfo(textureView), m_mtlr{mtlRenderer}, m_bufferOffset{bufferOffset} {}
+    ~LatteTextureReadbackInfoMtl();
+
+    void StartTransfer() override;
+
+    bool IsFinished() override;
+    void ForceFinish() override;
+
+    uint8* GetData() override;
+
+private:
+    class MetalRenderer* m_mtlr;
+
+    MTL::CommandBuffer* m_commandBuffer = nullptr;
+
+    uint32 m_bufferOffset = 0;
+};
diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.cpp
new file mode 100644
index 000000000..a06b11f02
--- /dev/null
+++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.cpp
@@ -0,0 +1,200 @@
+#include "Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.h"
+#include "Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.h"
+#include "Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h"
+#include "Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h"
+#include "Metal/MTLTexture.hpp"
+
+// Remaps a single GPU component selector for formats whose component order or channel
+// replication differs between the guest format and the Metal format used for it.
+uint32 LatteTextureMtl_AdjustTextureCompSel(Latte::E_GX2SURFFMT format, uint32 compSel)
+{
+    switch (format)
+    {
+    case Latte::E_GX2SURFFMT::R8_UNORM: // R8 is replicated on all channels (while OpenGL would return 1.0 for BGA instead)
+    case Latte::E_GX2SURFFMT::R8_SNORM: // probably the same as _UNORM, but needs testing
+        if (compSel >= 1 && compSel <= 3)
+            compSel = 0;
+        break;
+    case Latte::E_GX2SURFFMT::A1_B5_G5_R5_UNORM: // order of components is reversed (RGBA -> ABGR)
+        if (compSel >= 0 && compSel <= 3)
+            compSel = 3 - compSel;
+        break;
+    case Latte::E_GX2SURFFMT::BC4_UNORM:
+    case Latte::E_GX2SURFFMT::BC4_SNORM:
+        if (compSel >= 1 && compSel <= 3)
+            compSel = 0;
+        break;
+    case Latte::E_GX2SURFFMT::BC5_UNORM:
+    case Latte::E_GX2SURFFMT::BC5_SNORM:
+        // RG maps to RG
+        // B maps to ?
+        // A maps to G (guessed)
+        if (compSel == 3)
+            compSel = 1; // read Alpha as Green
+        break;
+    case Latte::E_GX2SURFFMT::A2_B10_G10_R10_UNORM:
+        // reverse components (Wii U: ABGR, OpenGL: RGBA)
+        // used in Resident Evil Revelations
+        if (compSel >= 0 && compSel <= 3)
+            compSel = 3 - compSel;
+        break;
+    case Latte::E_GX2SURFFMT::X24_G8_UINT:
+        // map everything to alpha?
+        if (compSel >= 0 && compSel <= 3)
+            compSel = 3;
+        break;
+    case Latte::E_GX2SURFFMT::R4_G4_UNORM:
+        // red and green swapped
+        if (compSel == 0)
+            compSel = 1;
+        else if (compSel == 1)
+            compSel = 0;
+        break;
+    default:
+        break;
+    }
+    return compSel;
+}
+
+LatteTextureViewMtl::LatteTextureViewMtl(MetalRenderer* mtlRenderer, LatteTextureMtl* texture, Latte::E_DIM dim, Latte::E_GX2SURFFMT format, sint32 firstMip, sint32 mipCount, sint32 firstSlice, sint32 sliceCount)
+    : LatteTextureView(texture, firstMip, mipCount, firstSlice, sliceCount, dim, format), m_mtlr(mtlRenderer), m_baseTexture(texture)
+{
+    m_rgbaView = CreateSwizzledView(RGBA_SWIZZLE);
+}
+
+LatteTextureViewMtl::~LatteTextureViewMtl()
+{
+    m_rgbaView->release();
+    for (const auto& entry : m_viewCache)
+    {
+        if (entry.key != INVALID_SWIZZLE)
+            entry.texture->release();
+    }
+
+    for (auto& [key, texture] : m_fallbackViewCache)
+    {
+        if (texture)
+            texture->release();
+    }
+}
+
+// Returns a texture view with the sampler's component swizzle applied. Views are created
+// on first use and cached: in a free slot of the fixed-size m_viewCache if there is one,
+// otherwise in m_fallbackViewCache.
+MTL::Texture* LatteTextureViewMtl::GetSwizzledView(uint32 gpuSamplerSwizzle)
+{
+    // Mask out
+    gpuSamplerSwizzle &= 0x0FFF0000;
+
+    // RGBA swizzle == no swizzle
+    if (gpuSamplerSwizzle == RGBA_SWIZZLE)
+    {
+        return m_rgbaView;
+    }
+
+    // First, try to find a view in the cache
+
+    // Fast cache
+    sint32 freeIndex = -1;
+    for (sint32 i = 0; i < std::size(m_viewCache); i++)
+    {
+        const auto& entry = m_viewCache[i];
+        if (entry.key == gpuSamplerSwizzle)
+        {
+            return entry.texture;
+        }
+        else if (entry.key == INVALID_SWIZZLE && freeIndex == -1)
+        {
+            freeIndex = i;
+        }
+    }
+
+    // Fallback cache (looked up with find() so that a miss does not default-insert a
+    // null entry, which operator[] would do even when the view ends up in the fast cache)
+    auto fallbackIt = m_fallbackViewCache.find(gpuSamplerSwizzle);
+    if (fallbackIt != m_fallbackViewCache.end() && fallbackIt->second)
+    {
+        return fallbackIt->second;
+    }
+
+    MTL::Texture* texture = CreateSwizzledView(gpuSamplerSwizzle);
+    if (freeIndex != -1)
+        m_viewCache[freeIndex] = {gpuSamplerSwizzle, texture};
+    else
+        m_fallbackViewCache[gpuSamplerSwizzle] = texture;
+
+    return texture;
+}
+
+// Creates a new Metal texture view of the base texture covering this view's mip and slice
+// range, with the (format-adjusted) component swizzle applied. The result is released in the destructor.
+MTL::Texture* LatteTextureViewMtl::CreateSwizzledView(uint32 gpuSamplerSwizzle)
+{
+    uint32 compSelR = (gpuSamplerSwizzle >> 16) & 0x7;
+    uint32 compSelG = (gpuSamplerSwizzle >> 19) & 0x7;
+    uint32 compSelB = (gpuSamplerSwizzle >> 22) & 0x7;
+    uint32 compSelA = (gpuSamplerSwizzle >> 25) & 0x7;
+    compSelR = LatteTextureMtl_AdjustTextureCompSel(format, compSelR);
+    compSelG = LatteTextureMtl_AdjustTextureCompSel(format, compSelG);
+    compSelB = LatteTextureMtl_AdjustTextureCompSel(format, compSelB);
+    compSelA = LatteTextureMtl_AdjustTextureCompSel(format, compSelA);
+
+    MTL::TextureType textureType;
+    switch (dim)
+    {
+    case Latte::E_DIM::DIM_1D:
+        textureType = MTL::TextureType1D;
+        break;
+    case Latte::E_DIM::DIM_2D:
+    case Latte::E_DIM::DIM_2D_MSAA:
+        textureType = MTL::TextureType2D;
+        break;
+    case Latte::E_DIM::DIM_2D_ARRAY:
+        textureType = MTL::TextureType2DArray;
+        break;
+    case Latte::E_DIM::DIM_3D:
+        textureType = MTL::TextureType3D;
+        break;
+    case Latte::E_DIM::DIM_CUBEMAP:
+        cemu_assert_debug(this->numSlice % 6 == 0 && "cubemaps must have an array length multiple of 6");
+
+        textureType = MTL::TextureTypeCubeArray;
+        break;
+    default:
+        cemu_assert_unimplemented();
+        textureType = MTL::TextureType2D;
+        break;
+    }
+
+    uint32 baseLevel = firstMip;
+    uint32 levelCount = this->numMip;
+    uint32 baseLayer = 0;
+    uint32 layerCount = 1;
+    // TODO: check if base texture is 3D texture as well
+    if (textureType == MTL::TextureType3D)
+    {
+        cemu_assert_debug(firstMip == 0);
+        cemu_assert_debug(this->numSlice == baseTexture->depth);
+    }
+    else
+    {
+        baseLayer = firstSlice;
+        if (textureType == MTL::TextureTypeCubeArray || textureType == MTL::TextureType2DArray)
+            layerCount = this->numSlice;
+    }
+
+    MTL::TextureSwizzleChannels swizzle;
+    swizzle.red = GetMtlTextureSwizzle(compSelR);
+    swizzle.green = GetMtlTextureSwizzle(compSelG);
+    swizzle.blue = GetMtlTextureSwizzle(compSelB);
+    swizzle.alpha = GetMtlTextureSwizzle(compSelA);
+
+    // Clamp mip levels
+    levelCount = std::min(levelCount, m_baseTexture->maxPossibleMipLevels - baseLevel);
+    levelCount = std::max(levelCount, (uint32)1);
+
+    auto pixelFormat = GetMtlPixelFormat(format, m_baseTexture->isDepth);
+    MTL::Texture* texture = m_baseTexture->GetTexture()->newTextureView(pixelFormat, textureType, NS::Range::Make(baseLevel, levelCount), NS::Range::Make(baseLayer, layerCount), swizzle);
+
+    return texture;
+}
diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.h b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.h
new file mode 100644
index 000000000..2634735ef
--- /dev/null
+++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.h
@@ -0,0 +1,40 @@
+#pragma once
+
+#include
+#include
+
+#include "Cafe/HW/Latte/Core/LatteTexture.h"
+
+// Sampler swizzle bits that select R, G, B, A in order (treated as "no swizzle")
+#define RGBA_SWIZZLE 0x06880000
+#define INVALID_SWIZZLE 0xFFFFFFFF
+
+class LatteTextureViewMtl : public LatteTextureView
+{
+public:
+    LatteTextureViewMtl(class MetalRenderer* mtlRenderer, class LatteTextureMtl* texture, Latte::E_DIM dim, Latte::E_GX2SURFFMT format, sint32 firstMip, sint32 mipCount, sint32 firstSlice, sint32 sliceCount);
+    ~LatteTextureViewMtl();
+
+    MTL::Texture* GetSwizzledView(uint32 gpuSamplerSwizzle);
+
+    MTL::Texture* GetRGBAView()
+    {
+        return GetSwizzledView(RGBA_SWIZZLE);
+    }
+
+private:
+    class MetalRenderer* m_mtlr;
+
+    class LatteTextureMtl* m_baseTexture;
+
+    MTL::Texture* m_rgbaView;
+    // Small fixed-size cache of swizzled views; INVALID_SWIZZLE marks a free slot
+    struct {
+        uint32 key;
+        MTL::Texture* texture;
+    } m_viewCache[4] = {{INVALID_SWIZZLE, nullptr}, {INVALID_SWIZZLE, nullptr}, {INVALID_SWIZZLE, nullptr}, {INVALID_SWIZZLE, nullptr}};
+    // Overflow cache used once all m_viewCache slots are taken
+    std::unordered_map m_fallbackViewCache;
+
+    MTL::Texture* CreateSwizzledView(uint32 gpuSamplerSwizzle);
+};
diff --git
a/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp
new file mode 100644
index 000000000..7bf295df8
--- /dev/null
+++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.cpp
@@ -0,0 +1,534 @@
+#include "Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h"
+#include "Cemu/Logging/CemuLogging.h"
+#include "HW/Latte/Core/LatteTextureLoader.h"
+#include "HW/Latte/Renderer/Metal/MetalCommon.h"
+
+// Maps Latte color surface formats to {Metal pixel format, data type, bytes per block
+// [, block texel size]}. Entries are patched at startup by CheckForPixelFormatSupport().
+std::map MTL_COLOR_FORMAT_TABLE = {
+    {Latte::E_GX2SURFFMT::INVALID_FORMAT, {MTL::PixelFormatInvalid, MetalDataType::NONE, 0}},
+
+    {Latte::E_GX2SURFFMT::R4_G4_UNORM, {MTL::PixelFormatABGR4Unorm, MetalDataType::FLOAT, 2}},
+    {Latte::E_GX2SURFFMT::R5_G6_B5_UNORM, {MTL::PixelFormatB5G6R5Unorm, MetalDataType::FLOAT, 2}},
+    {Latte::E_GX2SURFFMT::R5_G5_B5_A1_UNORM, {MTL::PixelFormatBGR5A1Unorm, MetalDataType::FLOAT, 2}},
+    {Latte::E_GX2SURFFMT::R4_G4_B4_A4_UNORM, {MTL::PixelFormatABGR4Unorm, MetalDataType::FLOAT, 2}},
+    {Latte::E_GX2SURFFMT::A1_B5_G5_R5_UNORM, {MTL::PixelFormatA1BGR5Unorm, MetalDataType::FLOAT, 2}},
+    {Latte::E_GX2SURFFMT::R8_UNORM, {MTL::PixelFormatR8Unorm, MetalDataType::FLOAT, 1}},
+    {Latte::E_GX2SURFFMT::R8_SNORM, {MTL::PixelFormatR8Snorm, MetalDataType::FLOAT, 1}},
+    {Latte::E_GX2SURFFMT::R8_UINT, {MTL::PixelFormatR8Uint, MetalDataType::UINT, 1}},
+    {Latte::E_GX2SURFFMT::R8_SINT, {MTL::PixelFormatR8Sint, MetalDataType::INT, 1}},
+    {Latte::E_GX2SURFFMT::R8_G8_UNORM, {MTL::PixelFormatRG8Unorm, MetalDataType::FLOAT, 2}},
+    {Latte::E_GX2SURFFMT::R8_G8_SNORM, {MTL::PixelFormatRG8Snorm, MetalDataType::FLOAT, 2}},
+    {Latte::E_GX2SURFFMT::R8_G8_UINT, {MTL::PixelFormatRG8Uint, MetalDataType::UINT, 2}},
+    {Latte::E_GX2SURFFMT::R8_G8_SINT, {MTL::PixelFormatRG8Sint, MetalDataType::INT, 2}},
+    {Latte::E_GX2SURFFMT::R8_G8_B8_A8_UNORM, {MTL::PixelFormatRGBA8Unorm, MetalDataType::FLOAT, 4}},
+    {Latte::E_GX2SURFFMT::R8_G8_B8_A8_SNORM, {MTL::PixelFormatRGBA8Snorm, MetalDataType::FLOAT, 4}},
+    {Latte::E_GX2SURFFMT::R8_G8_B8_A8_UINT, {MTL::PixelFormatRGBA8Uint, MetalDataType::UINT, 4}},
+    {Latte::E_GX2SURFFMT::R8_G8_B8_A8_SINT, {MTL::PixelFormatRGBA8Sint, MetalDataType::INT, 4}},
+    {Latte::E_GX2SURFFMT::R8_G8_B8_A8_SRGB, {MTL::PixelFormatRGBA8Unorm_sRGB, MetalDataType::FLOAT, 4}},
+    {Latte::E_GX2SURFFMT::R10_G10_B10_A2_UNORM, {MTL::PixelFormatRGB10A2Unorm, MetalDataType::FLOAT, 4}},
+    {Latte::E_GX2SURFFMT::R10_G10_B10_A2_SNORM, {MTL::PixelFormatRGBA16Snorm, MetalDataType::FLOAT, 8}},
+    {Latte::E_GX2SURFFMT::R10_G10_B10_A2_UINT, {MTL::PixelFormatRGB10A2Uint, MetalDataType::UINT, 4}},
+    {Latte::E_GX2SURFFMT::R10_G10_B10_A2_SINT, {MTL::PixelFormatRGBA16Sint, MetalDataType::INT, 8}},
+    {Latte::E_GX2SURFFMT::R10_G10_B10_A2_SRGB, {MTL::PixelFormatRGB10A2Unorm, MetalDataType::FLOAT, 4}}, // TODO: sRGB?
+    {Latte::E_GX2SURFFMT::A2_B10_G10_R10_UNORM, {MTL::PixelFormatBGR10A2Unorm, MetalDataType::FLOAT, 4}},
+    {Latte::E_GX2SURFFMT::A2_B10_G10_R10_UINT, {MTL::PixelFormatRGB10A2Uint, MetalDataType::UINT, 4}},
+    {Latte::E_GX2SURFFMT::R16_UNORM, {MTL::PixelFormatR16Unorm, MetalDataType::FLOAT, 2}},
+    {Latte::E_GX2SURFFMT::R16_SNORM, {MTL::PixelFormatR16Snorm, MetalDataType::FLOAT, 2}},
+    {Latte::E_GX2SURFFMT::R16_UINT, {MTL::PixelFormatR16Uint, MetalDataType::UINT, 2}},
+    {Latte::E_GX2SURFFMT::R16_SINT, {MTL::PixelFormatR16Sint, MetalDataType::INT, 2}},
+    {Latte::E_GX2SURFFMT::R16_FLOAT, {MTL::PixelFormatR16Float, MetalDataType::FLOAT, 2}},
+    {Latte::E_GX2SURFFMT::R16_G16_UNORM, {MTL::PixelFormatRG16Unorm, MetalDataType::FLOAT, 4}},
+    {Latte::E_GX2SURFFMT::R16_G16_SNORM, {MTL::PixelFormatRG16Snorm, MetalDataType::FLOAT, 4}},
+    {Latte::E_GX2SURFFMT::R16_G16_UINT, {MTL::PixelFormatRG16Uint, MetalDataType::UINT, 4}},
+    {Latte::E_GX2SURFFMT::R16_G16_SINT, {MTL::PixelFormatRG16Sint, MetalDataType::INT, 4}},
+    {Latte::E_GX2SURFFMT::R16_G16_FLOAT, {MTL::PixelFormatRG16Float, MetalDataType::FLOAT, 4}},
+    {Latte::E_GX2SURFFMT::R16_G16_B16_A16_UNORM, {MTL::PixelFormatRGBA16Unorm, MetalDataType::FLOAT, 8}},
+    {Latte::E_GX2SURFFMT::R16_G16_B16_A16_SNORM, {MTL::PixelFormatRGBA16Snorm, MetalDataType::FLOAT, 8}},
+    {Latte::E_GX2SURFFMT::R16_G16_B16_A16_UINT, {MTL::PixelFormatRGBA16Uint, MetalDataType::UINT, 8}},
+    {Latte::E_GX2SURFFMT::R16_G16_B16_A16_SINT, {MTL::PixelFormatRGBA16Sint, MetalDataType::INT, 8}},
+    {Latte::E_GX2SURFFMT::R16_G16_B16_A16_FLOAT, {MTL::PixelFormatRGBA16Float, MetalDataType::FLOAT, 8}},
+    {Latte::E_GX2SURFFMT::R24_X8_UNORM, {MTL::PixelFormatR32Float, MetalDataType::FLOAT, 4}}, // TODO: correct?
+    {Latte::E_GX2SURFFMT::R24_X8_FLOAT, {MTL::PixelFormatR32Float, MetalDataType::FLOAT, 4}}, // TODO: correct?
+    {Latte::E_GX2SURFFMT::X24_G8_UINT, {MTL::PixelFormatRGBA8Uint, MetalDataType::UINT, 4}}, // TODO: correct?
+    {Latte::E_GX2SURFFMT::R32_X8_FLOAT, {MTL::PixelFormatR32Float, MetalDataType::FLOAT, 4}}, // TODO: correct?
+    {Latte::E_GX2SURFFMT::X32_G8_UINT_X24, {MTL::PixelFormatRGBA16Uint, MetalDataType::UINT, 8}}, // TODO: correct?
+    {Latte::E_GX2SURFFMT::R11_G11_B10_FLOAT, {MTL::PixelFormatRG11B10Float, MetalDataType::FLOAT, 4}},
+    {Latte::E_GX2SURFFMT::R32_UINT, {MTL::PixelFormatR32Uint, MetalDataType::UINT, 4}},
+    {Latte::E_GX2SURFFMT::R32_SINT, {MTL::PixelFormatR32Sint, MetalDataType::INT, 4}},
+    {Latte::E_GX2SURFFMT::R32_FLOAT, {MTL::PixelFormatR32Float, MetalDataType::FLOAT, 4}},
+    {Latte::E_GX2SURFFMT::R32_G32_UINT, {MTL::PixelFormatRG32Uint, MetalDataType::UINT, 8}},
+    {Latte::E_GX2SURFFMT::R32_G32_SINT, {MTL::PixelFormatRG32Sint, MetalDataType::INT, 8}},
+    {Latte::E_GX2SURFFMT::R32_G32_FLOAT, {MTL::PixelFormatRG32Float, MetalDataType::FLOAT, 8}},
+    {Latte::E_GX2SURFFMT::R32_G32_B32_A32_UINT, {MTL::PixelFormatRGBA32Uint, MetalDataType::UINT, 16}},
+    {Latte::E_GX2SURFFMT::R32_G32_B32_A32_SINT, {MTL::PixelFormatRGBA32Sint, MetalDataType::INT, 16}},
+    {Latte::E_GX2SURFFMT::R32_G32_B32_A32_FLOAT, {MTL::PixelFormatRGBA32Float, MetalDataType::FLOAT, 16}},
+    {Latte::E_GX2SURFFMT::BC1_UNORM, {MTL::PixelFormatBC1_RGBA, MetalDataType::FLOAT, 8, {4, 4}}}, // TODO: correct?
+    {Latte::E_GX2SURFFMT::BC1_SRGB, {MTL::PixelFormatBC1_RGBA_sRGB, MetalDataType::FLOAT, 8, {4, 4}}}, // TODO: correct?
+    {Latte::E_GX2SURFFMT::BC2_UNORM, {MTL::PixelFormatBC2_RGBA, MetalDataType::FLOAT, 16, {4, 4}}}, // TODO: correct?
+    {Latte::E_GX2SURFFMT::BC2_SRGB, {MTL::PixelFormatBC2_RGBA_sRGB, MetalDataType::FLOAT, 16, {4, 4}}}, // TODO: correct?
+    {Latte::E_GX2SURFFMT::BC3_UNORM, {MTL::PixelFormatBC3_RGBA, MetalDataType::FLOAT, 16, {4, 4}}}, // TODO: correct?
+    {Latte::E_GX2SURFFMT::BC3_SRGB, {MTL::PixelFormatBC3_RGBA_sRGB, MetalDataType::FLOAT, 16, {4, 4}}}, // TODO: correct?
+    {Latte::E_GX2SURFFMT::BC4_UNORM, {MTL::PixelFormatBC4_RUnorm, MetalDataType::FLOAT, 8, {4, 4}}}, // TODO: correct?
+    {Latte::E_GX2SURFFMT::BC4_SNORM, {MTL::PixelFormatBC4_RSnorm, MetalDataType::FLOAT, 8, {4, 4}}}, // TODO: correct?
+    {Latte::E_GX2SURFFMT::BC5_UNORM, {MTL::PixelFormatBC5_RGUnorm, MetalDataType::FLOAT, 16, {4, 4}}}, // TODO: correct?
+    {Latte::E_GX2SURFFMT::BC5_SNORM, {MTL::PixelFormatBC5_RGSnorm, MetalDataType::FLOAT, 16, {4, 4}}}, // TODO: correct?
+};
+
+// Counterpart of MTL_COLOR_FORMAT_TABLE for depth and depth/stencil surface formats.
+std::map MTL_DEPTH_FORMAT_TABLE = {
+    {Latte::E_GX2SURFFMT::INVALID_FORMAT, {MTL::PixelFormatInvalid, MetalDataType::NONE, 0}},
+
+    {Latte::E_GX2SURFFMT::D24_S8_UNORM, {MTL::PixelFormatDepth24Unorm_Stencil8, MetalDataType::NONE, 4, {1, 1}, true}},
+    {Latte::E_GX2SURFFMT::D24_S8_FLOAT, {MTL::PixelFormatDepth32Float_Stencil8, MetalDataType::NONE, 4, {1, 1}, true}},
+    {Latte::E_GX2SURFFMT::D32_S8_FLOAT, {MTL::PixelFormatDepth32Float_Stencil8, MetalDataType::NONE, 5, {1, 1}, true}},
+    {Latte::E_GX2SURFFMT::D16_UNORM, {MTL::PixelFormatDepth16Unorm, MetalDataType::NONE, 2, {1, 1}}},
+    {Latte::E_GX2SURFFMT::D32_FLOAT, {MTL::PixelFormatDepth32Float, MetalDataType::NONE, 4, {1, 1}}},
+};
+
+// TODO: R10_G10_B10_A2_UINT and R10_G10_B10_A2_SINT
+// TODO: A2_B10_G10_R10_UNORM and A2_B10_G10_R10_UINT
+// Assigns the texture decoder of every supported format and, where the device lacks a
+// native format (packed 16-bit color, Depth24Unorm_Stencil8), redirects the table entry
+// to a wider replacement format together with a converting decoder.
+void CheckForPixelFormatSupport(const MetalPixelFormatSupport& support)
+{
+    // Texture decoders
+
+    // Color
+    MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R32_G32_B32_A32_FLOAT].textureDecoder = TextureDecoder_R32_G32_B32_A32_FLOAT::getInstance();
+    MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R32_G32_B32_A32_UINT].textureDecoder = TextureDecoder_R32_G32_B32_A32_UINT::getInstance();
+    MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R16_G16_B16_A16_FLOAT].textureDecoder = TextureDecoder_R16_G16_B16_A16_FLOAT::getInstance();
+    MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R16_G16_B16_A16_UINT].textureDecoder = TextureDecoder_R16_G16_B16_A16_UINT::getInstance();
+    MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R16_G16_B16_A16_UNORM].textureDecoder = TextureDecoder_R16_G16_B16_A16::getInstance();
+    MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R16_G16_B16_A16_SNORM].textureDecoder = TextureDecoder_R16_G16_B16_A16::getInstance();
+    MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R8_G8_B8_A8_UNORM].textureDecoder = TextureDecoder_R8_G8_B8_A8::getInstance();
+    MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R8_G8_B8_A8_SNORM].textureDecoder = TextureDecoder_R8_G8_B8_A8::getInstance();
+    MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R8_G8_B8_A8_SRGB].textureDecoder = TextureDecoder_R8_G8_B8_A8::getInstance();
+    MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R8_G8_B8_A8_UINT].textureDecoder = TextureDecoder_R8_G8_B8_A8::getInstance();
+    MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R8_G8_B8_A8_SINT].textureDecoder = TextureDecoder_R8_G8_B8_A8::getInstance();
+    MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R32_G32_FLOAT].textureDecoder = TextureDecoder_R32_G32_FLOAT::getInstance();
+    MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R32_G32_UINT].textureDecoder = TextureDecoder_R32_G32_UINT::getInstance();
+    MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R16_G16_UNORM].textureDecoder = TextureDecoder_R16_G16::getInstance();
+    MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R16_G16_FLOAT].textureDecoder = TextureDecoder_R16_G16_FLOAT::getInstance();
+    MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R8_G8_UNORM].textureDecoder = TextureDecoder_R8_G8::getInstance();
+    MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R8_G8_SNORM].textureDecoder = TextureDecoder_R8_G8::getInstance();
+    MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R4_G4_UNORM].textureDecoder = TextureDecoder_R4_G4_UNORM_To_ABGR4::getInstance();
+    MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R32_FLOAT].textureDecoder = TextureDecoder_R32_FLOAT::getInstance();
+    MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R32_UINT].textureDecoder = TextureDecoder_R32_UINT::getInstance();
+    MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R16_FLOAT].textureDecoder = TextureDecoder_R16_FLOAT::getInstance();
+    MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R16_UNORM].textureDecoder = TextureDecoder_R16_UNORM::getInstance();
+    MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R16_SNORM].textureDecoder = TextureDecoder_R16_SNORM::getInstance();
+    MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R16_UINT].textureDecoder = TextureDecoder_R16_UINT::getInstance();
+    MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R8_UNORM].textureDecoder = TextureDecoder_R8::getInstance();
+    MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R8_SNORM].textureDecoder = TextureDecoder_R8::getInstance();
+    MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R8_UINT].textureDecoder = TextureDecoder_R8_UINT::getInstance();
+    MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R5_G6_B5_UNORM].textureDecoder = TextureDecoder_R5_G6_B5_swappedRB::getInstance();
+    MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R5_G5_B5_A1_UNORM].textureDecoder = TextureDecoder_R5_G5_B5_A1_UNORM_swappedRB::getInstance();
+    MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::A1_B5_G5_R5_UNORM].textureDecoder = TextureDecoder_A1_B5_G5_R5_UNORM::getInstance();
+    MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R11_G11_B10_FLOAT].textureDecoder = TextureDecoder_R11_G11_B10_FLOAT::getInstance();
+    MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R4_G4_B4_A4_UNORM].textureDecoder = TextureDecoder_R4_G4_B4_A4_UNORM::getInstance();
+    MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R10_G10_B10_A2_UNORM].textureDecoder = TextureDecoder_R10_G10_B10_A2_UNORM::getInstance();
+    MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R10_G10_B10_A2_SNORM].textureDecoder = TextureDecoder_R10_G10_B10_A2_SNORM_To_RGBA16::getInstance();
+    MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R10_G10_B10_A2_SRGB].textureDecoder = TextureDecoder_R10_G10_B10_A2_UNORM::getInstance();
+    MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::BC1_SRGB].textureDecoder = TextureDecoder_BC1::getInstance();
+    MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::BC1_UNORM].textureDecoder = TextureDecoder_BC1::getInstance();
+    MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::BC2_UNORM].textureDecoder = TextureDecoder_BC2::getInstance();
+    MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::BC2_SRGB].textureDecoder = TextureDecoder_BC2::getInstance();
+    MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::BC3_UNORM].textureDecoder = TextureDecoder_BC3::getInstance();
+    MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::BC3_SRGB].textureDecoder = TextureDecoder_BC3::getInstance();
+    MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::BC4_UNORM].textureDecoder = TextureDecoder_BC4::getInstance();
+    MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::BC4_SNORM].textureDecoder = TextureDecoder_BC4::getInstance();
+    MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::BC5_UNORM].textureDecoder = TextureDecoder_BC5::getInstance();
+    MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::BC5_SNORM].textureDecoder = TextureDecoder_BC5::getInstance();
+    MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R24_X8_UNORM].textureDecoder = TextureDecoder_R24_X8::getInstance();
+    MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::X24_G8_UINT].textureDecoder = TextureDecoder_X24_G8_UINT::getInstance();
+
+    if (!support.m_supportsPacked16BitFormats)
+    {
+        // Every entry redirected to RGBA8Unorm must also raise bytesPerBlock to 4, otherwise row sizes are computed from the old 2-byte format
+        // B5G6R5Unorm
+        MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R5_G6_B5_UNORM].pixelFormat = MTL::PixelFormatRGBA8Unorm;
+        MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R5_G6_B5_UNORM].bytesPerBlock = 4;
+        MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R5_G6_B5_UNORM].textureDecoder = TextureDecoder_R5G6B5_UNORM_To_RGBA8::getInstance();
+
+        // A1BGR5Unorm
+        MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::A1_B5_G5_R5_UNORM].pixelFormat = MTL::PixelFormatRGBA8Unorm;
+        MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::A1_B5_G5_R5_UNORM].bytesPerBlock = 4;
+        MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::A1_B5_G5_R5_UNORM].textureDecoder = TextureDecoder_A1_B5_G5_R5_UNORM_vulkan_To_RGBA8::getInstance();
+
+        // ABGR4Unorm
+        MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R4_G4_UNORM].pixelFormat = MTL::PixelFormatRG8Unorm;
+        MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R4_G4_UNORM].bytesPerBlock = 2;
+        MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R4_G4_UNORM].textureDecoder = TextureDecoder_R4G4_UNORM_To_RG8::getInstance();
+
+        MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R4_G4_B4_A4_UNORM].pixelFormat = MTL::PixelFormatRGBA8Unorm;
+        MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R4_G4_B4_A4_UNORM].bytesPerBlock = 4;
+        MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R4_G4_B4_A4_UNORM].textureDecoder = TextureDecoder_R4G4B4A4_UNORM_To_RGBA8::getInstance();
+
+        // BGR5A1Unorm
+        MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R5_G5_B5_A1_UNORM].pixelFormat = MTL::PixelFormatRGBA8Unorm;
+        MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R5_G5_B5_A1_UNORM].bytesPerBlock = 4;
+        MTL_COLOR_FORMAT_TABLE[Latte::E_GX2SURFFMT::R5_G5_B5_A1_UNORM].textureDecoder = TextureDecoder_R5_G5_B5_A1_UNORM_swappedRB_To_RGBA8::getInstance();
+    }
+
+    // Depth
+    MTL_DEPTH_FORMAT_TABLE[Latte::E_GX2SURFFMT::D24_S8_UNORM].textureDecoder = TextureDecoder_D24_S8::getInstance();
+    MTL_DEPTH_FORMAT_TABLE[Latte::E_GX2SURFFMT::D24_S8_FLOAT].textureDecoder = TextureDecoder_NullData64::getInstance(); // TODO: why?
+    MTL_DEPTH_FORMAT_TABLE[Latte::E_GX2SURFFMT::D32_FLOAT].textureDecoder = TextureDecoder_R32_FLOAT::getInstance();
+    MTL_DEPTH_FORMAT_TABLE[Latte::E_GX2SURFFMT::D16_UNORM].textureDecoder = TextureDecoder_R16_UNORM::getInstance();
+    MTL_DEPTH_FORMAT_TABLE[Latte::E_GX2SURFFMT::D32_S8_FLOAT].textureDecoder = TextureDecoder_D32_S8_UINT_X24::getInstance();
+
+    if (!support.m_supportsDepth24Unorm_Stencil8)
+    {
+        // Depth24Unorm_Stencil8
+        MTL_DEPTH_FORMAT_TABLE[Latte::E_GX2SURFFMT::D24_S8_UNORM].pixelFormat = MTL::PixelFormatDepth32Float_Stencil8;
+        // TODO: implement the decoder
+        //MTL_DEPTH_FORMAT_TABLE[Latte::E_GX2SURFFMT::D24_S8_UNORM].textureDecoder = TextureDecoder_D24_S8_To_D32_S8::getInstance();
+    }
+}
+
+// Looks up the format info in the depth or color table. Formats missing from the table
+// fall back to Depth16Unorm (depth) or R8Unorm (color).
+const MetalPixelFormatInfo GetMtlPixelFormatInfo(Latte::E_GX2SURFFMT format, bool isDepth)
+{
+    if (isDepth)
+    {
+        auto it = MTL_DEPTH_FORMAT_TABLE.find(format);
+        if (it == MTL_DEPTH_FORMAT_TABLE.end())
+            return {MTL::PixelFormatDepth16Unorm, MetalDataType::NONE, 2}; // Fallback
+        else
+            return it->second;
+    }
+    else
+    {
+        auto it = MTL_COLOR_FORMAT_TABLE.find(format);
+        if (it == MTL_COLOR_FORMAT_TABLE.end())
+            return {MTL::PixelFormatR8Unorm, MetalDataType::FLOAT, 1}; // Fallback
+        else
+            return it->second;
+    }
+}
+
+// Returns the Metal pixel format for a Latte format and logs when the table maps it to PixelFormatInvalid.
+MTL::PixelFormat GetMtlPixelFormat(Latte::E_GX2SURFFMT format, bool isDepth)
+{
+    auto pixelFormat = GetMtlPixelFormatInfo(format, isDepth).pixelFormat;
+    if (pixelFormat == MTL::PixelFormatInvalid)
+        cemuLog_log(LogType::Force, "invalid pixel format 0x{:x}, is depth: {}\n", format, isDepth);
+
+    return pixelFormat;
+}
+
+// Integer division of a by b, rounded up (b must be non-zero).
+inline uint32 CeilDivide(uint32 a, uint32 b) {
+    return (a + b - 1) / b;
+}
+
+// Size in bytes of one row of blocks for a texture that is `width` texels wide.
+size_t GetMtlTextureBytesPerRow(Latte::E_GX2SURFFMT format, bool isDepth, uint32 width)
+{
+    const auto& formatInfo = GetMtlPixelFormatInfo(format, isDepth);
+
+    return CeilDivide(width, formatInfo.blockTexelSize.x) * formatInfo.bytesPerBlock;
+}
+
+// Size in bytes of one image (slice) that is `height` texels tall, given its row size.
+size_t GetMtlTextureBytesPerImage(Latte::E_GX2SURFFMT format, bool isDepth, uint32 height, size_t bytesPerRow)
+{
+    const auto& formatInfo = GetMtlPixelFormatInfo(format, isDepth);
+
+    return CeilDivide(height, formatInfo.blockTexelSize.y) * bytesPerRow;
+}
+
+// Translates a Latte primitive mode to the Metal primitive type used to draw (or emulate) it.
+MTL::PrimitiveType GetMtlPrimitiveType(LattePrimitiveMode primitiveMode)
+{
+    switch (primitiveMode)
+    {
+    case Latte::LATTE_VGT_PRIMITIVE_TYPE::E_PRIMITIVE_TYPE::POINTS:
+        return MTL::PrimitiveTypePoint;
+    case Latte::LATTE_VGT_PRIMITIVE_TYPE::E_PRIMITIVE_TYPE::LINES:
+        return MTL::PrimitiveTypeLine;
+    case Latte::LATTE_VGT_PRIMITIVE_TYPE::E_PRIMITIVE_TYPE::LINE_STRIP:
+        return MTL::PrimitiveTypeLineStrip;
+    case Latte::LATTE_VGT_PRIMITIVE_TYPE::E_PRIMITIVE_TYPE::LINE_LOOP:
+        return MTL::PrimitiveTypeLineStrip; // line loops are emulated as line strips with an extra connecting strip at the end
+    case Latte::LATTE_VGT_PRIMITIVE_TYPE::E_PRIMITIVE_TYPE::LINE_STRIP_ADJACENT: // Tropical Freeze level 3-6
+        cemuLog_logOnce(LogType::Force, "Metal doesn't support line strip adjacent primitive, using line strip instead");
+        return MTL::PrimitiveTypeLineStrip;
+    case Latte::LATTE_VGT_PRIMITIVE_TYPE::E_PRIMITIVE_TYPE::TRIANGLES:
+        return MTL::PrimitiveTypeTriangle;
+    case Latte::LATTE_VGT_PRIMITIVE_TYPE::E_PRIMITIVE_TYPE::TRIANGLE_FAN:
+        return MTL::PrimitiveTypeTriangleStrip;
+    case Latte::LATTE_VGT_PRIMITIVE_TYPE::E_PRIMITIVE_TYPE::TRIANGLE_STRIP:
+        return MTL::PrimitiveTypeTriangleStrip;
+    case Latte::LATTE_VGT_PRIMITIVE_TYPE::E_PRIMITIVE_TYPE::QUADS:
+        return MTL::PrimitiveTypeTriangle; // quads are emulated as 2 triangles
+    case Latte::LATTE_VGT_PRIMITIVE_TYPE::E_PRIMITIVE_TYPE::QUAD_STRIP:
+        return MTL::PrimitiveTypeTriangle; // quad strips are emulated as (count-2)/2 triangles
+    case Latte::LATTE_VGT_PRIMITIVE_TYPE::E_PRIMITIVE_TYPE::RECTS:
+        return MTL::PrimitiveTypeTriangle; // rects are emulated as 2 triangles
+    default:
+        cemuLog_log(LogType::Force, "Unsupported primitive mode {}", primitiveMode);
+        cemu_assert_debug(false);
+        return MTL::PrimitiveTypeTriangle;
+    }
+}
+
+// Translates a Latte vertex fetch format to a Metal vertex format. All formats, including
+// the float ones, map to unsigned integer formats of the same width and component count.
+MTL::VertexFormat GetMtlVertexFormat(uint8 format)
+{
+    switch (format)
+    {
+    case FMT_32_32_32_32_FLOAT:
+        return MTL::VertexFormatUInt4;
+    case FMT_32_32_32_FLOAT:
+        return MTL::VertexFormatUInt3;
+    case FMT_32_32_FLOAT:
+        return MTL::VertexFormatUInt2;
+    case FMT_32_FLOAT:
+        return MTL::VertexFormatUInt;
+    case FMT_8_8_8_8:
+        return MTL::VertexFormatUChar4;
+    case FMT_8_8_8:
+        return MTL::VertexFormatUChar3;
+    case FMT_8_8:
+        return MTL::VertexFormatUChar2;
+    case FMT_8:
+        return MTL::VertexFormatUChar;
+    case FMT_32_32_32_32:
+        return MTL::VertexFormatUInt4;
+    case FMT_32_32_32:
+        return MTL::VertexFormatUInt3;
+    case FMT_32_32:
+        return MTL::VertexFormatUInt2;
+    case FMT_32:
+        return MTL::VertexFormatUInt;
+    case FMT_16_16_16_16:
+        return MTL::VertexFormatUShort4; // verified to match OpenGL
+    case FMT_16_16_16:
+        return MTL::VertexFormatUShort3;
+    case FMT_16_16:
+        return MTL::VertexFormatUShort2;
+    case FMT_16:
+        return MTL::VertexFormatUShort;
+    case FMT_16_16_16_16_FLOAT:
+        return MTL::VertexFormatUShort4; // verified to match OpenGL
+    case FMT_16_16_16_FLOAT:
+        return MTL::VertexFormatUShort3;
+    case FMT_16_16_FLOAT:
+        return MTL::VertexFormatUShort2;
+    case FMT_16_FLOAT:
+        return MTL::VertexFormatUShort;
+    case FMT_2_10_10_10:
+        return MTL::VertexFormatUInt; // verified to match OpenGL
+    default:
+        cemuLog_log(LogType::Force, "unsupported vertex format {}", (uint32)format);
+        assert_dbg();
+        return MTL::VertexFormatInvalid;
+    }
+}
+
+// Size in bytes of one vertex attribute of the given Latte fetch format; 0 for unknown formats.
+uint32 GetMtlVertexFormatSize(uint8 format)
+{
+    switch (format)
+    {
+    case FMT_32_32_32_32_FLOAT:
+        return 16;
+    case FMT_32_32_32_FLOAT:
+        return 12;
+    case FMT_32_32_FLOAT:
+        return 8;
+    case FMT_32_FLOAT:
+        return 4;
+    case FMT_8_8_8_8:
+        return 4;
+    case FMT_8_8_8:
+        return 3;
+    case FMT_8_8:
+        return 2;
+    case FMT_8:
+        return 1;
+    case FMT_32_32_32_32:
+        return 16;
+    case FMT_32_32_32:
+        return 12;
+    case FMT_32_32:
+        return 8;
+    case FMT_32:
+        return 4;
+    case FMT_16_16_16_16:
+        return 8;
+    case FMT_16_16_16:
+        return 6;
+    case FMT_16_16:
+        return 4;
+    case FMT_16:
+        return 2;
+    case FMT_16_16_16_16_FLOAT:
+        return 8;
+    case FMT_16_16_16_FLOAT:
+        return 6;
+    case FMT_16_16_FLOAT:
+        return 4;
+    case FMT_16_FLOAT:
+        return 2;
+    case FMT_2_10_10_10:
+        return 4;
+    default:
+        return 0;
+    }
+}
+
+// Translates the renderer index type to Metal; unexpected values fall back to 32-bit indices.
+MTL::IndexType GetMtlIndexType(Renderer::INDEX_TYPE indexType)
+{
+    switch (indexType)
+    {
+    case Renderer::INDEX_TYPE::U16:
+        return MTL::IndexTypeUInt16;
+    case Renderer::INDEX_TYPE::U32:
+        return MTL::IndexTypeUInt32;
+    default:
+        cemu_assert_suspicious();
+        return MTL::IndexTypeUInt32;
+    }
+}
+
+// Translates a Latte blend combine function to Metal; unexpected values fall back to Add.
+MTL::BlendOperation GetMtlBlendOp(Latte::LATTE_CB_BLENDN_CONTROL::E_COMBINEFUNC combineFunc)
+{
+    switch (combineFunc)
+    {
+    case Latte::LATTE_CB_BLENDN_CONTROL::E_COMBINEFUNC::DST_PLUS_SRC:
+        return MTL::BlendOperationAdd;
+    case Latte::LATTE_CB_BLENDN_CONTROL::E_COMBINEFUNC::SRC_MINUS_DST:
+        return MTL::BlendOperationSubtract;
+    case Latte::LATTE_CB_BLENDN_CONTROL::E_COMBINEFUNC::MIN_DST_SRC:
+        return MTL::BlendOperationMin;
+    case Latte::LATTE_CB_BLENDN_CONTROL::E_COMBINEFUNC::MAX_DST_SRC:
+        return MTL::BlendOperationMax;
+    case Latte::LATTE_CB_BLENDN_CONTROL::E_COMBINEFUNC::DST_MINUS_SRC:
+        return MTL::BlendOperationReverseSubtract;
+    default:
+        cemu_assert_suspicious();
+        return MTL::BlendOperationAdd;
+    }
+}
+
+// Metal blend factors indexed by the numeric value of the Latte blend factor (0x00..0x14).
+const MTL::BlendFactor MTL_BLEND_FACTORS[] =
+{
+    /* 0x00 */ MTL::BlendFactorZero,
+    /* 0x01 */ MTL::BlendFactorOne,
+    /* 0x02 */ MTL::BlendFactorSourceColor,
+    /* 0x03 */ MTL::BlendFactorOneMinusSourceColor,
+    /* 0x04 */ MTL::BlendFactorSourceAlpha,
+    /* 0x05 */ MTL::BlendFactorOneMinusSourceAlpha,
+    /* 0x06 */ MTL::BlendFactorDestinationAlpha,
+    /* 0x07 */ MTL::BlendFactorOneMinusDestinationAlpha,
+    /* 0x08 */ MTL::BlendFactorDestinationColor,
+    /* 0x09 */ MTL::BlendFactorOneMinusDestinationColor,
+    /* 0x0A */ MTL::BlendFactorSourceAlphaSaturated,
+    /* 0x0B */ MTL::BlendFactorZero, // TODO
+    /* 0x0C */ MTL::BlendFactorZero, // TODO
+    /* 0x0D */ MTL::BlendFactorBlendColor,
+    /* 0x0E */ MTL::BlendFactorOneMinusBlendColor,
+    /* 0x0F */ MTL::BlendFactorSource1Color,
+    /* 0x10 */ MTL::BlendFactorOneMinusSource1Color,
+    /* 0x11 */ MTL::BlendFactorSource1Alpha,
+    /* 0x12 */ MTL::BlendFactorOneMinusSource1Alpha,
+    /* 0x13 */ MTL::BlendFactorBlendAlpha,
+    /* 0x14 */ MTL::BlendFactorOneMinusBlendAlpha
+};
+
+MTL::BlendFactor GetMtlBlendFactor(Latte::LATTE_CB_BLENDN_CONTROL::E_BLENDFACTOR factor)
+{
+    cemu_assert_debug((uint32)factor < std::size(MTL_BLEND_FACTORS));
+    return MTL_BLEND_FACTORS[(uint32)factor];
+}
+
+// Metal compare functions indexed by the numeric value of Latte::E_COMPAREFUNC.
+const MTL::CompareFunction MTL_COMPARE_FUNCTIONS[8] =
+{
+    MTL::CompareFunctionNever,
+    MTL::CompareFunctionLess,
+    MTL::CompareFunctionEqual,
+    MTL::CompareFunctionLessEqual,
+    MTL::CompareFunctionGreater,
+    MTL::CompareFunctionNotEqual,
+    MTL::CompareFunctionGreaterEqual,
+    MTL::CompareFunctionAlways
+};
+
+MTL::CompareFunction GetMtlCompareFunc(Latte::E_COMPAREFUNC func)
+{
+    cemu_assert_debug((uint32)func < std::size(MTL_COMPARE_FUNCTIONS));
+    return MTL_COMPARE_FUNCTIONS[(uint32)func];
+}
+
+// TODO: clamp to border color?
(should be fine though) +const MTL::SamplerAddressMode MTL_SAMPLER_ADDRESS_MODES[] = { + MTL::SamplerAddressModeRepeat, // WRAP + MTL::SamplerAddressModeMirrorRepeat, // MIRROR + MTL::SamplerAddressModeClampToEdge, // CLAMP_LAST_TEXEL + MTL::SamplerAddressModeMirrorClampToEdge, // MIRROR_ONCE_LAST_TEXEL + MTL::SamplerAddressModeClampToEdge, // unsupported HALF_BORDER + MTL::SamplerAddressModeClampToBorderColor, // unsupported MIRROR_ONCE_HALF_BORDER + MTL::SamplerAddressModeClampToBorderColor, // CLAMP_BORDER + MTL::SamplerAddressModeClampToBorderColor // MIRROR_ONCE_BORDER +}; + +MTL::SamplerAddressMode GetMtlSamplerAddressMode(Latte::LATTE_SQ_TEX_SAMPLER_WORD0_0::E_CLAMP clamp) +{ + cemu_assert_debug((uint32)clamp < std::size(MTL_SAMPLER_ADDRESS_MODES)); + return MTL_SAMPLER_ADDRESS_MODES[(uint32)clamp]; +} + +const MTL::TextureSwizzle MTL_TEXTURE_SWIZZLES[] = { + MTL::TextureSwizzleRed, + MTL::TextureSwizzleGreen, + MTL::TextureSwizzleBlue, + MTL::TextureSwizzleAlpha, + MTL::TextureSwizzleZero, + MTL::TextureSwizzleOne, + MTL::TextureSwizzleZero, + MTL::TextureSwizzleZero +}; + +MTL::TextureSwizzle GetMtlTextureSwizzle(uint32 swizzle) +{ + cemu_assert_debug(swizzle < std::size(MTL_TEXTURE_SWIZZLES)); + return MTL_TEXTURE_SWIZZLES[swizzle]; +} + +const MTL::StencilOperation MTL_STENCIL_OPERATIONS[8] = { + MTL::StencilOperationKeep, + MTL::StencilOperationZero, + MTL::StencilOperationReplace, + MTL::StencilOperationIncrementClamp, + MTL::StencilOperationDecrementClamp, + MTL::StencilOperationInvert, + MTL::StencilOperationIncrementWrap, + MTL::StencilOperationDecrementWrap +}; + +MTL::StencilOperation GetMtlStencilOp(Latte::LATTE_DB_DEPTH_CONTROL::E_STENCILACTION action) +{ + cemu_assert_debug((uint32)action < std::size(MTL_STENCIL_OPERATIONS)); + return MTL_STENCIL_OPERATIONS[(uint32)action]; +} + +MTL::ColorWriteMask GetMtlColorWriteMask(uint8 mask) +{ + MTL::ColorWriteMask mtlMask = MTL::ColorWriteMaskNone; + if (mask & 0x1) mtlMask |= MTL::ColorWriteMaskRed; + 
if (mask & 0x2) mtlMask |= MTL::ColorWriteMaskGreen; + if (mask & 0x4) mtlMask |= MTL::ColorWriteMaskBlue; + if (mask & 0x8) mtlMask |= MTL::ColorWriteMaskAlpha; + + return mtlMask; +} diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h b/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h new file mode 100644 index 000000000..ef25ca5d5 --- /dev/null +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h @@ -0,0 +1,86 @@ +#pragma once + +#include "Cafe/HW/Latte/Renderer/Metal/MetalCommon.h" + +#include "Cafe/HW/Latte/ISA/LatteReg.h" +#include "Cafe/HW/Latte/Core/LatteConst.h" +//#include "Cafe/HW/Latte/Core/FetchShader.h" +#include "Cafe/HW/Latte/Renderer/Renderer.h" +#include "Common/precompiled.h" +#include "HW/Latte/Core/LatteTextureLoader.h" + +struct Uvec2 { + uint32 x; + uint32 y; +}; + +enum class MetalDataType +{ + NONE, + INT, + UINT, + FLOAT, +}; + +struct MetalPixelFormatInfo { + MTL::PixelFormat pixelFormat; + MetalDataType dataType; + size_t bytesPerBlock; + Uvec2 blockTexelSize = {1, 1}; + bool hasStencil = false; + TextureDecoder* textureDecoder = nullptr; +}; + +void CheckForPixelFormatSupport(const MetalPixelFormatSupport& support); + +const MetalPixelFormatInfo GetMtlPixelFormatInfo(Latte::E_GX2SURFFMT format, bool isDepth); + +MTL::PixelFormat GetMtlPixelFormat(Latte::E_GX2SURFFMT format, bool isDepth); + +inline MetalDataType GetColorBufferDataType(const uint32 index, const LatteContextRegister& lcr) +{ + auto format = LatteMRT::GetColorBufferFormat(index, lcr); + return GetMtlPixelFormatInfo(format, false).dataType; +} + +inline const char* GetDataTypeStr(MetalDataType dataType) +{ + switch (dataType) + { + case MetalDataType::INT: + return "int4"; + case MetalDataType::UINT: + return "uint4"; + case MetalDataType::FLOAT: + return "float4"; + default: + cemu_assert_suspicious(); + return "INVALID"; + } +} + +size_t GetMtlTextureBytesPerRow(Latte::E_GX2SURFFMT format, bool isDepth, uint32 width); + +size_t 
GetMtlTextureBytesPerImage(Latte::E_GX2SURFFMT format, bool isDepth, uint32 height, size_t bytesPerRow); + +MTL::PrimitiveType GetMtlPrimitiveType(LattePrimitiveMode primitiveMode); + +MTL::VertexFormat GetMtlVertexFormat(uint8 format); + +uint32 GetMtlVertexFormatSize(uint8 format); + +MTL::IndexType GetMtlIndexType(Renderer::INDEX_TYPE indexType); + +MTL::BlendOperation GetMtlBlendOp(Latte::LATTE_CB_BLENDN_CONTROL::E_COMBINEFUNC combineFunc); + +MTL::BlendFactor GetMtlBlendFactor(Latte::LATTE_CB_BLENDN_CONTROL::E_BLENDFACTOR factor); + +MTL::CompareFunction GetMtlCompareFunc(Latte::E_COMPAREFUNC func); + +MTL::SamplerAddressMode GetMtlSamplerAddressMode(Latte::LATTE_SQ_TEX_SAMPLER_WORD0_0::E_CLAMP clamp); + +MTL::TextureSwizzle GetMtlTextureSwizzle(uint32 swizzle); + +MTL::StencilOperation GetMtlStencilOp(Latte::LATTE_DB_DEPTH_CONTROL::E_STENCILACTION action); + +MTL::ColorWriteMask GetMtlColorWriteMask(uint8 mask); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalAttachmentsInfo.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalAttachmentsInfo.cpp new file mode 100644 index 000000000..88a2dface --- /dev/null +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalAttachmentsInfo.cpp @@ -0,0 +1,48 @@ +#include "Cafe/HW/Latte/Renderer/Metal/MetalAttachmentsInfo.h" +#include "Cafe/HW/Latte/Renderer/Metal/CachedFBOMtl.h" +#include "Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.h" +#include "Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h" + +MetalAttachmentsInfo::MetalAttachmentsInfo(class CachedFBOMtl* fbo) +{ + for (uint8 i = 0; i < LATTE_NUM_COLOR_TARGET; i++) + { + const auto& colorBuffer = fbo->colorBuffer[i]; + auto texture = static_cast(colorBuffer.texture); + if (!texture) + continue; + + colorFormats[i] = texture->format; + } + + // Depth stencil attachment + if (fbo->depthBuffer.texture) + { + auto texture = static_cast(fbo->depthBuffer.texture); + depthFormat = texture->format; + hasStencil = fbo->depthBuffer.hasStencil; + } +} + 
+MetalAttachmentsInfo::MetalAttachmentsInfo(const LatteContextRegister& lcr, const LatteDecompilerShader* pixelShader) +{ + uint8 cbMask = LatteMRT::GetActiveColorBufferMask(pixelShader, lcr); + bool dbMask = LatteMRT::GetActiveDepthBufferMask(lcr); + + // Color attachments + for (int i = 0; i < 8; ++i) + { + if ((cbMask & (1 << i)) == 0) + continue; + + colorFormats[i] = LatteMRT::GetColorBufferFormat(i, lcr); + } + + // Depth stencil attachment + if (dbMask) + { + Latte::E_GX2SURFFMT format = LatteMRT::GetDepthBufferFormat(lcr); + depthFormat = format; + hasStencil = GetMtlPixelFormatInfo(format, true).hasStencil; + } +} diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalAttachmentsInfo.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalAttachmentsInfo.h new file mode 100644 index 000000000..c8ebe7c11 --- /dev/null +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalAttachmentsInfo.h @@ -0,0 +1,15 @@ +#pragma once + +#include "Cafe/HW/Latte/Renderer/Metal/MetalCommon.h" + +class MetalAttachmentsInfo +{ +public: + MetalAttachmentsInfo() = default; + MetalAttachmentsInfo(class CachedFBOMtl* fbo); + MetalAttachmentsInfo(const LatteContextRegister& lcr, const class LatteDecompilerShader* pixelShader); + + Latte::E_GX2SURFFMT colorFormats[LATTE_NUM_COLOR_TARGET] = {Latte::E_GX2SURFFMT::INVALID_FORMAT}; + Latte::E_GX2SURFFMT depthFormat = Latte::E_GX2SURFFMT::INVALID_FORMAT; + bool hasStencil = false; +}; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalBufferAllocator.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalBufferAllocator.cpp new file mode 100644 index 000000000..05d169b30 --- /dev/null +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalBufferAllocator.cpp @@ -0,0 +1,217 @@ +#include "Cafe/HW/Latte/Renderer/Metal/MetalBufferAllocator.h" + +MetalBufferChunkedHeap::~MetalBufferChunkedHeap() +{ + for (auto& chunk : m_chunkBuffers) + chunk->release(); +} + +uint32 MetalBufferChunkedHeap::allocateNewChunk(uint32 chunkIndex, uint32 minimumAllocationSize) +{ + size_t allocationSize = 
std::max(m_minimumBufferAllocationSize, minimumAllocationSize); + MTL::Buffer* buffer = m_mtlr->GetDevice()->newBuffer(allocationSize, m_options); + cemu_assert_debug(buffer); + cemu_assert_debug(m_chunkBuffers.size() == chunkIndex); + m_chunkBuffers.emplace_back(buffer); + + return allocationSize; +} + +void MetalSynchronizedRingAllocator::addUploadBufferSyncPoint(AllocatorBuffer_t& buffer, uint32 offset) +{ + auto commandBuffer = m_mtlr->GetCurrentCommandBuffer(); + if (commandBuffer == buffer.lastSyncpointCommandBuffer) + return; + buffer.lastSyncpointCommandBuffer = commandBuffer; + buffer.queue_syncPoints.emplace(commandBuffer, offset); +} + +void MetalSynchronizedRingAllocator::allocateAdditionalUploadBuffer(uint32 sizeRequiredForAlloc) +{ + // calculate buffer size, should be a multiple of bufferAllocSize that is at least as large as sizeRequiredForAlloc + uint32 bufferAllocSize = m_minimumBufferAllocSize; + while (bufferAllocSize < sizeRequiredForAlloc) + bufferAllocSize += m_minimumBufferAllocSize; + + AllocatorBuffer_t newBuffer{}; + newBuffer.writeIndex = 0; + newBuffer.basePtr = nullptr; + newBuffer.mtlBuffer = m_mtlr->GetDevice()->newBuffer(bufferAllocSize, m_options); + newBuffer.basePtr = (uint8*)newBuffer.mtlBuffer->contents(); + newBuffer.size = bufferAllocSize; + newBuffer.index = (uint32)m_buffers.size(); + m_buffers.push_back(newBuffer); +} + +MetalSynchronizedRingAllocator::AllocatorReservation_t MetalSynchronizedRingAllocator::AllocateBufferMemory(uint32 size, uint32 alignment) +{ + if (alignment < 128) + alignment = 128; + size = (size + 127) & ~127; + + for (auto& itr : m_buffers) + { + // align pointer + uint32 alignmentPadding = (alignment - (itr.writeIndex % alignment)) % alignment; + uint32 distanceToSyncPoint; + if (!itr.queue_syncPoints.empty()) + { + if (itr.queue_syncPoints.front().offset < itr.writeIndex) + distanceToSyncPoint = 0xFFFFFFFF; + else + distanceToSyncPoint = itr.queue_syncPoints.front().offset - itr.writeIndex; + } + 
else + distanceToSyncPoint = 0xFFFFFFFF; + uint32 spaceNeeded = alignmentPadding + size; + if (spaceNeeded > distanceToSyncPoint) + continue; // not enough space in current buffer + if ((itr.writeIndex + spaceNeeded) > itr.size) + { + // wrap-around + spaceNeeded = size; + alignmentPadding = 0; + // check if there is enough space in current buffer after wrap-around + if (!itr.queue_syncPoints.empty()) + { + distanceToSyncPoint = itr.queue_syncPoints.front().offset - 0; + if (spaceNeeded > distanceToSyncPoint) + continue; + } + else if (spaceNeeded > itr.size) + continue; + itr.writeIndex = 0; + } + addUploadBufferSyncPoint(itr, itr.writeIndex); + itr.writeIndex += alignmentPadding; + uint32 offset = itr.writeIndex; + itr.writeIndex += size; + itr.cleanupCounter = 0; + MetalSynchronizedRingAllocator::AllocatorReservation_t res; + res.mtlBuffer = itr.mtlBuffer; + res.memPtr = itr.basePtr + offset; + res.bufferOffset = offset; + res.size = size; + res.bufferIndex = itr.index; + + return res; + } + + // allocate new buffer + allocateAdditionalUploadBuffer(size); + + return AllocateBufferMemory(size, alignment); +} + +void MetalSynchronizedRingAllocator::FlushReservation(AllocatorReservation_t& uploadReservation) +{ + if (RequiresFlush()) + { + uploadReservation.mtlBuffer->didModifyRange(NS::Range(uploadReservation.bufferOffset, uploadReservation.size)); + } +} + +void MetalSynchronizedRingAllocator::CleanupBuffer(MTL::CommandBuffer* latestFinishedCommandBuffer) +{ + for (auto& itr : m_buffers) + { + while (!itr.queue_syncPoints.empty() && latestFinishedCommandBuffer == itr.queue_syncPoints.front().commandBuffer) + { + itr.queue_syncPoints.pop(); + } + if (itr.queue_syncPoints.empty()) + itr.cleanupCounter++; + } + + // check if last buffer is available for deletion + if (m_buffers.size() >= 2) + { + auto& lastBuffer = m_buffers.back(); + if (lastBuffer.cleanupCounter >= 1000) + { + // release buffer + lastBuffer.mtlBuffer->release(); + m_buffers.pop_back(); + } + } +} 
+ +MTL::Buffer* MetalSynchronizedRingAllocator::GetBufferByIndex(uint32 index) const +{ + return m_buffers[index].mtlBuffer; +} + +void MetalSynchronizedRingAllocator::GetStats(uint32& numBuffers, size_t& totalBufferSize, size_t& freeBufferSize) const +{ + numBuffers = (uint32)m_buffers.size(); + totalBufferSize = 0; + freeBufferSize = 0; + for (auto& itr : m_buffers) + { + totalBufferSize += itr.size; + // calculate free space in buffer + uint32 distanceToSyncPoint; + if (!itr.queue_syncPoints.empty()) + { + if (itr.queue_syncPoints.front().offset < itr.writeIndex) + distanceToSyncPoint = (itr.size - itr.writeIndex) + itr.queue_syncPoints.front().offset; // size with wrap-around + else + distanceToSyncPoint = itr.queue_syncPoints.front().offset - itr.writeIndex; + } + else + distanceToSyncPoint = itr.size; + freeBufferSize += distanceToSyncPoint; + } +} + +/* MetalSynchronizedHeapAllocator */ + +MetalSynchronizedHeapAllocator::AllocatorReservation* MetalSynchronizedHeapAllocator::AllocateBufferMemory(uint32 size, uint32 alignment) +{ + CHAddr addr = m_chunkedHeap.alloc(size, alignment); + m_activeAllocations.emplace_back(addr); + AllocatorReservation* res = m_poolAllocatorReservation.allocObj(); + res->bufferIndex = addr.chunkIndex; + res->bufferOffset = addr.offset; + res->size = size; + res->mtlBuffer = m_chunkedHeap.GetBufferByIndex(addr.chunkIndex); + res->memPtr = m_chunkedHeap.GetChunkPtr(addr.chunkIndex) + addr.offset; + + return res; +} + +void MetalSynchronizedHeapAllocator::FreeReservation(AllocatorReservation* uploadReservation) +{ + // put the allocation on a delayed release queue for the current command buffer + MTL::CommandBuffer* currentCommandBuffer = m_mtlr->GetCurrentCommandBuffer(); + auto it = std::find_if(m_activeAllocations.begin(), m_activeAllocations.end(), [&uploadReservation](const TrackedAllocation& allocation) { return allocation.allocation.chunkIndex == uploadReservation->bufferIndex && allocation.allocation.offset == 
uploadReservation->bufferOffset; }); + cemu_assert_debug(it != m_activeAllocations.end()); + m_releaseQueue[currentCommandBuffer].emplace_back(it->allocation); + m_activeAllocations.erase(it); + m_poolAllocatorReservation.freeObj(uploadReservation); +} + +void MetalSynchronizedHeapAllocator::FlushReservation(AllocatorReservation* uploadReservation) +{ + if (m_chunkedHeap.RequiresFlush()) + { + uploadReservation->mtlBuffer->didModifyRange(NS::Range(uploadReservation->bufferOffset, uploadReservation->size)); + } +} + +void MetalSynchronizedHeapAllocator::CleanupBuffer(MTL::CommandBuffer* latestFinishedCommandBuffer) +{ + auto it = m_releaseQueue.find(latestFinishedCommandBuffer); + if (it == m_releaseQueue.end()) + return; + + // release allocations + for (auto& addr : it->second) + m_chunkedHeap.free(addr); + m_releaseQueue.erase(it); +} + +void MetalSynchronizedHeapAllocator::GetStats(uint32& numBuffers, size_t& totalBufferSize, size_t& freeBufferSize) const +{ + m_chunkedHeap.GetStats(numBuffers, totalBufferSize, freeBufferSize); +} diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalBufferAllocator.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalBufferAllocator.h new file mode 100644 index 000000000..2a62de19c --- /dev/null +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalBufferAllocator.h @@ -0,0 +1,163 @@ +#pragma once + +#include "Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h" +#include "Metal/MTLResource.hpp" +#include "util/ChunkedHeap/ChunkedHeap.h" +#include "util/helpers/MemoryPool.h" + +#include + +inline MTL::ResourceOptions GetResourceOptions(MTL::ResourceOptions options) +{ + if (options & MTL::ResourceStorageModeShared || options & MTL::ResourceStorageModeManaged) + options |= MTL::ResourceCPUCacheModeWriteCombined; + + return options; +} + +class MetalBufferChunkedHeap : private ChunkedHeap<> +{ + public: + MetalBufferChunkedHeap(const class MetalRenderer* mtlRenderer, MTL::ResourceOptions options, size_t minimumBufferAllocationSize) : m_mtlr(mtlRenderer), 
m_options(GetResourceOptions(options)), m_minimumBufferAllocationSize(minimumBufferAllocationSize) { }; + ~MetalBufferChunkedHeap(); + + using ChunkedHeap::alloc; + using ChunkedHeap::free; + + uint8* GetChunkPtr(uint32 index) const + { + if (index >= m_chunkBuffers.size()) + return nullptr; + + return (uint8*)m_chunkBuffers[index]->contents(); + } + + MTL::Buffer* GetBufferByIndex(uint32 index) const + { + cemu_assert_debug(index < m_chunkBuffers.size()); + + return m_chunkBuffers[index]; + } + + bool RequiresFlush() const + { + return m_options & MTL::ResourceStorageModeManaged; + } + + void GetStats(uint32& numBuffers, size_t& totalBufferSize, size_t& freeBufferSize) const + { + numBuffers = m_chunkBuffers.size(); + totalBufferSize = m_numHeapBytes; + freeBufferSize = m_numHeapBytes - m_numAllocatedBytes; + } + + private: + uint32 allocateNewChunk(uint32 chunkIndex, uint32 minimumAllocationSize) override; + + const class MetalRenderer* m_mtlr; + + MTL::ResourceOptions m_options; + size_t m_minimumBufferAllocationSize; + + std::vector m_chunkBuffers; +}; + +// a circular ring-buffer which tracks and releases memory per command-buffer +class MetalSynchronizedRingAllocator +{ +public: + MetalSynchronizedRingAllocator(class MetalRenderer* mtlRenderer, MTL::ResourceOptions options, uint32 minimumBufferAllocSize) : m_mtlr(mtlRenderer), m_options(GetResourceOptions(options)), m_minimumBufferAllocSize(minimumBufferAllocSize) {}; + MetalSynchronizedRingAllocator(const MetalSynchronizedRingAllocator&) = delete; // disallow copy + + struct BufferSyncPoint_t + { + // todo - modularize sync point + MTL::CommandBuffer* commandBuffer; + uint32 offset; + + BufferSyncPoint_t(MTL::CommandBuffer* _commandBuffer, uint32 _offset) : commandBuffer(_commandBuffer), offset(_offset) {}; + }; + + struct AllocatorBuffer_t + { + MTL::Buffer* mtlBuffer; + uint8* basePtr; + uint32 size; + uint32 writeIndex; + std::queue queue_syncPoints; + MTL::CommandBuffer* lastSyncpointCommandBuffer{ 
nullptr }; + uint32 index; + uint32 cleanupCounter{ 0 }; // increased by one every time CleanupBuffer() is called if there is no sync point. If it reaches 300 then the buffer is released + }; + + struct AllocatorReservation_t + { + MTL::Buffer* mtlBuffer; + uint8* memPtr; + uint32 bufferOffset; + uint32 size; + uint32 bufferIndex; + }; + + AllocatorReservation_t AllocateBufferMemory(uint32 size, uint32 alignment); + void FlushReservation(AllocatorReservation_t& uploadReservation); + void CleanupBuffer(MTL::CommandBuffer* latestFinishedCommandBuffer); + MTL::Buffer* GetBufferByIndex(uint32 index) const; + + bool RequiresFlush() const + { + return m_options & MTL::ResourceStorageModeManaged; + } + + void GetStats(uint32& numBuffers, size_t& totalBufferSize, size_t& freeBufferSize) const; + +private: + void allocateAdditionalUploadBuffer(uint32 sizeRequiredForAlloc); + void addUploadBufferSyncPoint(AllocatorBuffer_t& buffer, uint32 offset); + + const class MetalRenderer* m_mtlr; + + MTL::ResourceOptions m_options; + const uint32 m_minimumBufferAllocSize; + + std::vector m_buffers; +}; + +// heap style allocator with released memory being freed after the current command buffer finishes +class MetalSynchronizedHeapAllocator +{ + struct TrackedAllocation + { + TrackedAllocation(CHAddr allocation) : allocation(allocation) {}; + CHAddr allocation; + }; + + public: + MetalSynchronizedHeapAllocator(class MetalRenderer* mtlRenderer, MTL::ResourceOptions options, size_t minimumBufferAllocSize) : m_mtlr(mtlRenderer), m_chunkedHeap(m_mtlr, options, minimumBufferAllocSize) {} + MetalSynchronizedHeapAllocator(const MetalSynchronizedHeapAllocator&) = delete; // disallow copy + + struct AllocatorReservation + { + MTL::Buffer* mtlBuffer; + uint8* memPtr; + uint32 bufferOffset; + uint32 size; + uint32 bufferIndex; + }; + + AllocatorReservation* AllocateBufferMemory(uint32 size, uint32 alignment); + void FreeReservation(AllocatorReservation* uploadReservation); + void 
FlushReservation(AllocatorReservation* uploadReservation); + + void CleanupBuffer(MTL::CommandBuffer* latestFinishedCommandBuffer); + + void GetStats(uint32& numBuffers, size_t& totalBufferSize, size_t& freeBufferSize) const; + private: + const class MetalRenderer* m_mtlr; + MetalBufferChunkedHeap m_chunkedHeap; + // allocations + std::vector m_activeAllocations; + MemoryPool m_poolAllocatorReservation{32}; + // release queue + std::unordered_map> m_releaseQueue; +}; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalCommon.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalCommon.h new file mode 100644 index 000000000..a03e7cae2 --- /dev/null +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalCommon.h @@ -0,0 +1,179 @@ +#pragma once + +#include +#include + +#include "Cafe/HW/Latte/Core/LatteConst.h" + +struct MetalPixelFormatSupport +{ + bool m_supportsR8Unorm_sRGB; + bool m_supportsRG8Unorm_sRGB; + bool m_supportsPacked16BitFormats; + bool m_supportsDepth24Unorm_Stencil8; + + MetalPixelFormatSupport() = default; + MetalPixelFormatSupport(MTL::Device* device) + { + m_supportsR8Unorm_sRGB = device->supportsFamily(MTL::GPUFamilyApple1); + m_supportsRG8Unorm_sRGB = device->supportsFamily(MTL::GPUFamilyApple1); + m_supportsPacked16BitFormats = device->supportsFamily(MTL::GPUFamilyApple1); + m_supportsDepth24Unorm_Stencil8 = device->depth24Stencil8PixelFormatSupported(); + } +}; + +// TODO: don't define a new struct for this +struct MetalQueryRange +{ + uint32 begin; + uint32 end; +}; + +#define MAX_MTL_BUFFERS 31 +// Buffer indices 28-30 are reserved for the helper shaders +#define MTL_RESERVED_BUFFERS 3 +#define MAX_MTL_VERTEX_BUFFERS (MAX_MTL_BUFFERS - MTL_RESERVED_BUFFERS) +#define GET_MTL_VERTEX_BUFFER_INDEX(index) (MAX_MTL_VERTEX_BUFFERS - index - 1) + +#define MAX_MTL_TEXTURES 31 +#define MAX_MTL_SAMPLERS 16 + +#define GET_HELPER_BUFFER_BINDING(index) (28 + index) +#define GET_HELPER_TEXTURE_BINDING(index) (29 + index) +#define GET_HELPER_SAMPLER_BINDING(index) (14 + index) + 
+constexpr uint32 INVALID_UINT32 = std::numeric_limits::max(); +constexpr size_t INVALID_OFFSET = std::numeric_limits::max(); + +inline size_t Align(size_t size, size_t alignment) +{ + return (size + alignment - 1) & ~(alignment - 1); +} + +//inline std::string GetColorAttachmentTypeStr(uint32 index) +//{ +// return "COLOR_ATTACHMENT" + std::to_string(index) + "_TYPE"; +//} + +// Cast from const char* to NS::String* +inline NS::String* ToNSString(const char* str) +{ + return NS::String::string(str, NS::ASCIIStringEncoding); +} + +// Cast from std::string to NS::String* +inline NS::String* ToNSString(const std::string& str) +{ + return ToNSString(str.c_str()); +} + +// Cast from const char* to NS::URL* +inline NS::URL* ToNSURL(const char* str) +{ + return NS::URL::fileURLWithPath(ToNSString(str)); +} + +// Cast from std::string to NS::URL* +inline NS::URL* ToNSURL(const std::string& str) +{ + return ToNSURL(str.c_str()); +} + +inline NS::String* GetLabel(const std::string& label, const void* identifier) +{ + return ToNSString(label + " (" + std::to_string(reinterpret_cast(identifier)) + ")"); +} + +constexpr MTL::RenderStages ALL_MTL_RENDER_STAGES = MTL::RenderStageVertex | MTL::RenderStageObject | MTL::RenderStageMesh | MTL::RenderStageFragment; + +inline bool IsValidDepthTextureType(Latte::E_DIM dim) +{ + return (dim == Latte::E_DIM::DIM_2D || dim == Latte::E_DIM::DIM_2D_MSAA || dim == Latte::E_DIM::DIM_2D_ARRAY || dim == Latte::E_DIM::DIM_2D_ARRAY_MSAA || dim == Latte::E_DIM::DIM_CUBEMAP); +} + +inline bool CommandBufferCompleted(MTL::CommandBuffer* commandBuffer) +{ + auto status = commandBuffer->status(); + return (status == MTL::CommandBufferStatusCompleted || status == MTL::CommandBufferStatusError); +} + +inline bool FormatIsRenderable(Latte::E_GX2SURFFMT format) +{ + return !Latte::IsCompressedFormat(format); +} + +template +inline bool executeCommand(fmt::format_string fmt, T&&... 
args) { + std::string command = fmt::format(fmt, std::forward(args)...); + int res = system(command.c_str()); + if (res != 0) + { + cemuLog_log(LogType::Force, "command \"{}\" failed with exit code {}", command, res); + return false; + } + + return true; +} + +class MemoryMappedFile +{ +public: + MemoryMappedFile(const std::string& filePath) + { + // Open the file + m_fd = open(filePath.c_str(), O_RDONLY); + if (m_fd == -1) { + cemuLog_log(LogType::Force, "failed to open file: {}", filePath); + return; + } + + // Get the file size + // Use a loop to handle the case where the file size is 0 (more of a safety net) + struct stat fileStat; + while (true) + { + if (fstat(m_fd, &fileStat) == -1) + { + close(m_fd); + cemuLog_log(LogType::Force, "failed to get file size: {}", filePath); + return; + } + m_fileSize = fileStat.st_size; + + if (m_fileSize == 0) + { + cemuLog_logOnce(LogType::Force, "file size is 0: {}", filePath); + std::this_thread::sleep_for(std::chrono::milliseconds(10)); + continue; + } + + break; + } + + // Memory map the file + m_data = mmap(nullptr, m_fileSize, PROT_READ, MAP_PRIVATE, m_fd, 0); + if (m_data == MAP_FAILED) + { + close(m_fd); + cemuLog_log(LogType::Force, "failed to memory map file: {}", filePath); + return; + } + } + + ~MemoryMappedFile() + { + if (m_data && m_data != MAP_FAILED) + munmap(m_data, m_fileSize); + + if (m_fd != -1) + close(m_fd); + } + + uint8* data() const { return static_cast(m_data); } + size_t size() const { return m_fileSize; } + +private: + int m_fd = -1; + void* m_data = nullptr; + size_t m_fileSize = 0; +}; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalCppImpl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalCppImpl.cpp new file mode 100644 index 000000000..13cd9dd67 --- /dev/null +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalCppImpl.cpp @@ -0,0 +1,6 @@ +#define NS_PRIVATE_IMPLEMENTATION +#define CA_PRIVATE_IMPLEMENTATION +#define MTL_PRIVATE_IMPLEMENTATION +#include +#include +#include diff --git 
a/src/Cafe/HW/Latte/Renderer/Metal/MetalDepthStencilCache.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalDepthStencilCache.cpp new file mode 100644 index 000000000..a1e4005b5 --- /dev/null +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalDepthStencilCache.cpp @@ -0,0 +1,123 @@ +#include "Cafe/HW/Latte/Renderer/Metal/MetalDepthStencilCache.h" +#include "Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h" +#include "HW/Latte/ISA/RegDefines.h" +#include "HW/Latte/Renderer/Metal/LatteToMtl.h" +#include "Metal/MTLDepthStencil.hpp" + +MetalDepthStencilCache::~MetalDepthStencilCache() +{ + for (auto& pair : m_depthStencilCache) + { + pair.second->release(); + } + m_depthStencilCache.clear(); +} + +MTL::DepthStencilState* MetalDepthStencilCache::GetDepthStencilState(const LatteContextRegister& lcr) +{ + uint64 stateHash = CalculateDepthStencilHash(lcr); + auto& depthStencilState = m_depthStencilCache[stateHash]; + if (depthStencilState) + return depthStencilState; + + // Depth stencil state + bool depthEnable = lcr.DB_DEPTH_CONTROL.get_Z_ENABLE(); + auto depthFunc = lcr.DB_DEPTH_CONTROL.get_Z_FUNC(); + bool depthWriteEnable = lcr.DB_DEPTH_CONTROL.get_Z_WRITE_ENABLE(); + + MTL::DepthStencilDescriptor* desc = MTL::DepthStencilDescriptor::alloc()->init(); + if (depthEnable) + { + desc->setDepthWriteEnabled(depthWriteEnable); + desc->setDepthCompareFunction(GetMtlCompareFunc(depthFunc)); + } + + // Stencil state + bool stencilEnable = lcr.DB_DEPTH_CONTROL.get_STENCIL_ENABLE(); + if (stencilEnable) + { + // get stencil control parameters + bool backStencilEnable = lcr.DB_DEPTH_CONTROL.get_BACK_STENCIL_ENABLE(); + auto frontStencilFunc = lcr.DB_DEPTH_CONTROL.get_STENCIL_FUNC_F(); + auto frontStencilZPass = lcr.DB_DEPTH_CONTROL.get_STENCIL_ZPASS_F(); + auto frontStencilZFail = lcr.DB_DEPTH_CONTROL.get_STENCIL_ZFAIL_F(); + auto frontStencilFail = lcr.DB_DEPTH_CONTROL.get_STENCIL_FAIL_F(); + auto backStencilFunc = lcr.DB_DEPTH_CONTROL.get_STENCIL_FUNC_B(); + auto backStencilZPass = 
lcr.DB_DEPTH_CONTROL.get_STENCIL_ZPASS_B(); + auto backStencilZFail = lcr.DB_DEPTH_CONTROL.get_STENCIL_ZFAIL_B(); + auto backStencilFail = lcr.DB_DEPTH_CONTROL.get_STENCIL_FAIL_B(); + // get stencil control parameters + uint32 stencilCompareMaskFront = lcr.DB_STENCILREFMASK.get_STENCILMASK_F(); + uint32 stencilWriteMaskFront = lcr.DB_STENCILREFMASK.get_STENCILWRITEMASK_F(); + uint32 stencilCompareMaskBack = lcr.DB_STENCILREFMASK_BF.get_STENCILMASK_B(); + uint32 stencilWriteMaskBack = lcr.DB_STENCILREFMASK_BF.get_STENCILWRITEMASK_B(); + + MTL::StencilDescriptor* frontStencil = MTL::StencilDescriptor::alloc()->init(); + frontStencil->setReadMask(stencilCompareMaskFront); + frontStencil->setWriteMask(stencilWriteMaskFront); + frontStencil->setStencilCompareFunction(GetMtlCompareFunc(frontStencilFunc)); + frontStencil->setDepthFailureOperation(GetMtlStencilOp(frontStencilZFail)); + frontStencil->setStencilFailureOperation(GetMtlStencilOp(frontStencilFail)); + frontStencil->setDepthStencilPassOperation(GetMtlStencilOp(frontStencilZPass)); + desc->setFrontFaceStencil(frontStencil); + + MTL::StencilDescriptor* backStencil = MTL::StencilDescriptor::alloc()->init(); + if (backStencilEnable) + { + backStencil->setReadMask(stencilCompareMaskBack); + backStencil->setWriteMask(stencilWriteMaskBack); + backStencil->setStencilCompareFunction(GetMtlCompareFunc(backStencilFunc)); + backStencil->setDepthFailureOperation(GetMtlStencilOp(backStencilZFail)); + backStencil->setStencilFailureOperation(GetMtlStencilOp(backStencilFail)); + backStencil->setDepthStencilPassOperation(GetMtlStencilOp(backStencilZPass)); + } + else + { + backStencil->setReadMask(stencilCompareMaskFront); + backStencil->setWriteMask(stencilWriteMaskFront); + backStencil->setStencilCompareFunction(GetMtlCompareFunc(frontStencilFunc)); + backStencil->setDepthFailureOperation(GetMtlStencilOp(frontStencilZFail)); + backStencil->setStencilFailureOperation(GetMtlStencilOp(frontStencilFail)); + 
backStencil->setDepthStencilPassOperation(GetMtlStencilOp(frontStencilZPass)); + } + desc->setBackFaceStencil(backStencil); + + frontStencil->release(); + backStencil->release(); + } + + depthStencilState = m_mtlr->GetDevice()->newDepthStencilState(desc); + desc->release(); + + return depthStencilState; +} + +uint64 MetalDepthStencilCache::CalculateDepthStencilHash(const LatteContextRegister& lcr) +{ + uint32* ctxRegister = lcr.GetRawView(); + + // Hash + uint64 stateHash = 0; + uint32 depthControl = ctxRegister[Latte::REGADDR::DB_DEPTH_CONTROL]; + bool stencilTestEnable = depthControl & 1; + if (stencilTestEnable) + { + stateHash += ctxRegister[mmDB_STENCILREFMASK]; + stateHash = std::rotl(stateHash, 17); + if(depthControl & (1<<7)) // back stencil enable + { + stateHash += ctxRegister[mmDB_STENCILREFMASK_BF]; + stateHash = std::rotl(stateHash, 13); + } + } + else + { + // zero out stencil related bits (8-31) + depthControl &= 0xFF; + } + + stateHash = std::rotl(stateHash, 17); + stateHash += depthControl; + + return stateHash; +} diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalDepthStencilCache.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalDepthStencilCache.h new file mode 100644 index 000000000..4ce05c286 --- /dev/null +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalDepthStencilCache.h @@ -0,0 +1,21 @@ +#pragma once + +#include + +#include "HW/Latte/ISA/LatteReg.h" + +class MetalDepthStencilCache +{ +public: + MetalDepthStencilCache(class MetalRenderer* metalRenderer) : m_mtlr{metalRenderer} {} + ~MetalDepthStencilCache(); + + MTL::DepthStencilState* GetDepthStencilState(const LatteContextRegister& lcr); + +private: + class MetalRenderer* m_mtlr; + + std::map m_depthStencilCache; + + uint64 CalculateDepthStencilHash(const LatteContextRegister& lcr); +}; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalLayer.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalLayer.h new file mode 100644 index 000000000..d2b30667c --- /dev/null +++ 
b/src/Cafe/HW/Latte/Renderer/Metal/MetalLayer.h @@ -0,0 +1,3 @@ +#pragma once + +void* CreateMetalLayer(void* handle, float& scaleX, float& scaleY); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalLayer.mm b/src/Cafe/HW/Latte/Renderer/Metal/MetalLayer.mm new file mode 100644 index 000000000..16a7aa676 --- /dev/null +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalLayer.mm @@ -0,0 +1,22 @@ +#include "Cafe/HW/Latte/Renderer/Metal/MetalLayer.h" + +#include "Cafe/HW/Latte/Renderer/MetalView.h" + +void* CreateMetalLayer(void* handle, float& scaleX, float& scaleY) +{ + NSView* view = (NSView*)handle; + + MetalView* childView = [[MetalView alloc] initWithFrame:view.bounds]; + childView.autoresizingMask = NSViewWidthSizable | NSViewHeightSizable; + childView.wantsLayer = YES; + + [view addSubview:childView]; + + const NSRect points = [childView frame]; + const NSRect pixels = [childView convertRectToBacking:points]; + + scaleX = (float)(pixels.size.width / points.size.width); + scaleY = (float)(pixels.size.height / points.size.height); + + return childView.layer; +} diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalLayerHandle.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalLayerHandle.cpp new file mode 100644 index 000000000..1155c1528 --- /dev/null +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalLayerHandle.cpp @@ -0,0 +1,46 @@ +#include "Cafe/HW/Latte/Renderer/Metal/MetalLayerHandle.h" +#include "Cafe/HW/Latte/Renderer/Metal/MetalLayer.h" + +#include "gui/guiWrapper.h" + +MetalLayerHandle::MetalLayerHandle(MTL::Device* device, const Vector2i& size, bool mainWindow) +{ + const auto& windowInfo = (mainWindow ? 
gui_getWindowInfo().window_main : gui_getWindowInfo().window_pad); + + m_layer = (CA::MetalLayer*)CreateMetalLayer(windowInfo.handle, m_layerScaleX, m_layerScaleY); + m_layer->setDevice(device); + m_layer->setDrawableSize(CGSize{(float)size.x * m_layerScaleX, (float)size.y * m_layerScaleY}); + m_layer->setFramebufferOnly(true); +} + +MetalLayerHandle::~MetalLayerHandle() +{ + if (m_layer) + m_layer->release(); +} + +void MetalLayerHandle::Resize(const Vector2i& size) +{ + m_layer->setDrawableSize(CGSize{(float)size.x * m_layerScaleX, (float)size.y * m_layerScaleY}); +} + +bool MetalLayerHandle::AcquireDrawable() +{ + if (m_drawable) + return true; + + m_drawable = m_layer->nextDrawable(); + if (!m_drawable) + { + cemuLog_log(LogType::Force, "layer {} failed to acquire next drawable", (void*)this); + return false; + } + + return true; +} + +void MetalLayerHandle::PresentDrawable(MTL::CommandBuffer* commandBuffer) +{ + commandBuffer->presentDrawable(m_drawable); + m_drawable = nullptr; +} diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalLayerHandle.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalLayerHandle.h new file mode 100644 index 000000000..014d2d432 --- /dev/null +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalLayerHandle.h @@ -0,0 +1,31 @@ +#pragma once + +#include + +#include "Cafe/HW/Latte/Renderer/Metal/MetalCommon.h" +#include "util/math/vector2.h" + +class MetalLayerHandle +{ +public: + MetalLayerHandle() = default; + MetalLayerHandle(MTL::Device* device, const Vector2i& size, bool mainWindow); + + ~MetalLayerHandle(); + + void Resize(const Vector2i& size); + + bool AcquireDrawable(); + + void PresentDrawable(MTL::CommandBuffer* commandBuffer); + + CA::MetalLayer* GetLayer() const { return m_layer; } + + CA::MetalDrawable* GetDrawable() const { return m_drawable; } + +private: + CA::MetalLayer* m_layer = nullptr; + float m_layerScaleX, m_layerScaleY; + + CA::MetalDrawable* m_drawable = nullptr; +}; diff --git 
a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp new file mode 100644 index 000000000..7b1dd53fc --- /dev/null +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp @@ -0,0 +1,101 @@ +#include "Cafe/HW/Latte/Renderer/Metal/MetalCommon.h" +#include "Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h" +#include "Cafe/HW/Latte/Renderer/Metal/MetalVoidVertexPipeline.h" + +#include "Cemu/Logging/CemuLogging.h" +#include "Common/precompiled.h" +#include "HW/MMU/MMU.h" + +MetalMemoryManager::~MetalMemoryManager() +{ + if (m_bufferCache) + { + m_bufferCache->release(); + } +} + +void* MetalMemoryManager::AcquireTextureUploadBuffer(size_t size) +{ + if (m_textureUploadBuffer.size() < size) + { + m_textureUploadBuffer.resize(size); + } + + return m_textureUploadBuffer.data(); +} + +void MetalMemoryManager::ReleaseTextureUploadBuffer(uint8* mem) +{ + cemu_assert_debug(m_textureUploadBuffer.data() == mem); + m_textureUploadBuffer.clear(); +} + +void MetalMemoryManager::InitBufferCache(size_t size) +{ + cemu_assert_debug(!m_bufferCache); + + m_bufferCacheMode = g_current_game_profile->GetBufferCacheMode(); + + // First, try to import the host memory as a buffer + if (m_bufferCacheMode == BufferCacheMode::Host) + { + if (m_mtlr->HasUnifiedMemory()) + { + m_importedMemBaseAddress = mmuRange_MEM2.getBase(); + m_hostAllocationSize = mmuRange_MEM2.getSize(); + m_bufferCache = m_mtlr->GetDevice()->newBuffer(memory_getPointerFromVirtualOffset(m_importedMemBaseAddress), m_hostAllocationSize, MTL::ResourceStorageModeShared, nullptr); + if (!m_bufferCache) + { + cemuLog_log(LogType::Force, "Failed to import host memory as a buffer, using device shared mode instead"); + m_bufferCacheMode = BufferCacheMode::DeviceShared; + } + } + else + { + cemuLog_log(LogType::Force, "Host buffer cache mode is only available on unified memory systems, using device shared mode instead"); + m_bufferCacheMode = 
BufferCacheMode::DeviceShared; + } + } + + if (!m_bufferCache) + m_bufferCache = m_mtlr->GetDevice()->newBuffer(size, (m_bufferCacheMode == BufferCacheMode::DevicePrivate ? MTL::ResourceStorageModePrivate : MTL::ResourceStorageModeShared)); + +#ifdef CEMU_DEBUG_ASSERT + m_bufferCache->setLabel(GetLabel("Buffer cache", m_bufferCache)); +#endif +} + +void MetalMemoryManager::UploadToBufferCache(const void* data, size_t offset, size_t size) +{ + cemu_assert_debug(m_bufferCacheMode != BufferCacheMode::Host); + cemu_assert_debug(m_bufferCache); + cemu_assert_debug((offset + size) <= m_bufferCache->length()); + + if (m_bufferCacheMode == BufferCacheMode::DevicePrivate) + { + auto blitCommandEncoder = m_mtlr->GetBlitCommandEncoder(); + + auto allocation = m_stagingAllocator.AllocateBufferMemory(size, 1); + memcpy(allocation.memPtr, data, size); + m_stagingAllocator.FlushReservation(allocation); + + blitCommandEncoder->copyFromBuffer(allocation.mtlBuffer, allocation.bufferOffset, m_bufferCache, offset, size); + + //m_mtlr->CopyBufferToBuffer(allocation.mtlBuffer, allocation.bufferOffset, m_bufferCache, offset, size, ALL_MTL_RENDER_STAGES, ALL_MTL_RENDER_STAGES); + } + else + { + memcpy((uint8*)m_bufferCache->contents() + offset, data, size); + } +} + +void MetalMemoryManager::CopyBufferCache(size_t srcOffset, size_t dstOffset, size_t size) +{ + cemu_assert_debug(m_bufferCacheMode != BufferCacheMode::Host); + cemu_assert_debug(m_bufferCache); + + if (m_bufferCacheMode == BufferCacheMode::DevicePrivate) + m_mtlr->CopyBufferToBuffer(m_bufferCache, srcOffset, m_bufferCache, dstOffset, size, ALL_MTL_RENDER_STAGES, ALL_MTL_RENDER_STAGES); + else + memcpy((uint8*)m_bufferCache->contents() + dstOffset, (uint8*)m_bufferCache->contents() + srcOffset, size); +} diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h new file mode 100644 index 000000000..4e55fa6f5 --- /dev/null +++ 
b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h @@ -0,0 +1,76 @@ +#pragma once + +#include "Cafe/HW/Latte/Renderer/Metal/MetalBufferAllocator.h" + +#include "GameProfile/GameProfile.h" + +class MetalMemoryManager +{ +public: + MetalMemoryManager(class MetalRenderer* metalRenderer) : m_mtlr{metalRenderer}, m_stagingAllocator(m_mtlr, m_mtlr->GetOptimalBufferStorageMode(), 32u * 1024 * 1024), m_indexAllocator(m_mtlr, m_mtlr->GetOptimalBufferStorageMode(), 4u * 1024 * 1024) {} + ~MetalMemoryManager(); + + MetalSynchronizedRingAllocator& GetStagingAllocator() + { + return m_stagingAllocator; + } + + MetalSynchronizedHeapAllocator& GetIndexAllocator() + { + return m_indexAllocator; + } + + MTL::Buffer* GetBufferCache() + { + return m_bufferCache; + } + + void CleanupBuffers(MTL::CommandBuffer* latestFinishedCommandBuffer) + { + m_stagingAllocator.CleanupBuffer(latestFinishedCommandBuffer); + m_indexAllocator.CleanupBuffer(latestFinishedCommandBuffer); + } + + // Texture upload buffer + void* AcquireTextureUploadBuffer(size_t size); + void ReleaseTextureUploadBuffer(uint8* mem); + + // Buffer cache + void InitBufferCache(size_t size); + void UploadToBufferCache(const void* data, size_t offset, size_t size); + void CopyBufferCache(size_t srcOffset, size_t dstOffset, size_t size); + + // Getters + bool UseHostMemoryForCache() const + { + return (m_bufferCacheMode == BufferCacheMode::Host); + } + + bool NeedsReducedLatency() const + { + return (m_bufferCacheMode == BufferCacheMode::DeviceShared || m_bufferCacheMode == BufferCacheMode::Host); + } + + MPTR GetImportedMemBaseAddress() const + { + return m_importedMemBaseAddress; + } + + size_t GetHostAllocationSize() const + { + return m_hostAllocationSize; + } + +private: + class MetalRenderer* m_mtlr; + + std::vector m_textureUploadBuffer; + + MetalSynchronizedRingAllocator m_stagingAllocator; + MetalSynchronizedHeapAllocator m_indexAllocator; + + MTL::Buffer* m_bufferCache = nullptr; + BufferCacheMode 
m_bufferCacheMode; + MPTR m_importedMemBaseAddress; + size_t m_hostAllocationSize = 0; +}; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalOutputShaderCache.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalOutputShaderCache.cpp new file mode 100644 index 000000000..aec662bb7 --- /dev/null +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalOutputShaderCache.cpp @@ -0,0 +1,37 @@ +#include "Cafe/HW/Latte/Renderer/Metal/MetalOutputShaderCache.h" +#include "Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.h" + +MetalOutputShaderCache::~MetalOutputShaderCache() +{ + for (uint8 i = 0; i < METAL_OUTPUT_SHADER_CACHE_SIZE; i++) + { + if (m_cache[i]) + m_cache[i]->release(); + } +} + +MTL::RenderPipelineState* MetalOutputShaderCache::GetPipeline(RendererOutputShader* shader, uint8 shaderIndex, bool usesSRGB) +{ + uint8 cacheIndex = (usesSRGB ? METAL_SHADER_TYPE_COUNT : 0) + shaderIndex; + auto& renderPipelineState = m_cache[cacheIndex]; + if (renderPipelineState) + return renderPipelineState; + + // Create a new render pipeline state + auto vertexShaderMtl = static_cast(shader->GetVertexShader())->GetFunction(); + auto fragmentShaderMtl = static_cast(shader->GetFragmentShader())->GetFunction(); + + auto renderPipelineDescriptor = MTL::RenderPipelineDescriptor::alloc()->init(); + renderPipelineDescriptor->setVertexFunction(vertexShaderMtl); + renderPipelineDescriptor->setFragmentFunction(fragmentShaderMtl); + renderPipelineDescriptor->colorAttachments()->object(0)->setPixelFormat(usesSRGB ? 
MTL::PixelFormatBGRA8Unorm_sRGB : MTL::PixelFormatBGRA8Unorm); + + NS::Error* error = nullptr; + renderPipelineState = m_mtlr->GetDevice()->newRenderPipelineState(renderPipelineDescriptor, &error); + if (error) + { + cemuLog_log(LogType::Force, "error creating output render pipeline state: {}", error->localizedDescription()->utf8String()); + } + + return renderPipelineState; +} diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalOutputShaderCache.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalOutputShaderCache.h new file mode 100644 index 000000000..85b9e8b24 --- /dev/null +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalOutputShaderCache.h @@ -0,0 +1,20 @@ +#pragma once + +#include "Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h" + +constexpr uint8 METAL_SHADER_TYPE_COUNT = 6; +constexpr uint8 METAL_OUTPUT_SHADER_CACHE_SIZE = 2 * METAL_SHADER_TYPE_COUNT; + +class MetalOutputShaderCache +{ +public: + MetalOutputShaderCache(class MetalRenderer* metalRenderer) : m_mtlr{metalRenderer} {} + ~MetalOutputShaderCache(); + + MTL::RenderPipelineState* GetPipeline(RendererOutputShader* shader, uint8 shaderIndex, bool usesSRGB); + +private: + class MetalRenderer* m_mtlr; + + MTL::RenderPipelineState* m_cache[METAL_OUTPUT_SHADER_CACHE_SIZE] = {nullptr}; +}; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalPerformanceMonitor.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalPerformanceMonitor.h new file mode 100644 index 000000000..bdbaa84b9 --- /dev/null +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalPerformanceMonitor.h @@ -0,0 +1,26 @@ +#pragma once + +class MetalPerformanceMonitor +{ +public: + // Per frame data + uint32 m_commandBuffers = 0; + uint32 m_renderPasses = 0; + uint32 m_clears = 0; + uint32 m_manualVertexFetchDraws = 0; + uint32 m_meshDraws = 0; + uint32 m_triangleFans = 0; + + MetalPerformanceMonitor() = default; + ~MetalPerformanceMonitor() = default; + + void ResetPerFrameData() + { + m_commandBuffers = 0; + m_renderPasses = 0; + m_clears = 0; + m_manualVertexFetchDraws = 0; + 
m_meshDraws = 0; + m_triangleFans = 0; + } +}; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp new file mode 100644 index 000000000..a922365b2 --- /dev/null +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp @@ -0,0 +1,621 @@ +#include "Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.h" +#include "Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h" +#include "Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h" +#include "Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.h" + +#include "Cafe/HW/Latte/Core/FetchShader.h" +#include "Cafe/HW/Latte/ISA/RegDefines.h" +#include "Cafe/HW/Latte/Core/LatteConst.h" +#include "Cafe/HW/Latte/Common/RegisterSerializer.h" +#include "Cafe/HW/Latte/Core/LatteShaderCache.h" +#include "Cafe/HW/Latte/Core/LatteShader.h" +#include "Cafe/HW/Latte/ISA/LatteReg.h" +#include "Cemu/FileCache/FileCache.h" +#include "Common/precompiled.h" +#include "util/helpers/helpers.h" +#include "config/ActiveSettings.h" + +#include + +static bool g_compilePipelineThreadInit{false}; +static std::mutex g_compilePipelineMutex; +static std::condition_variable g_compilePipelineCondVar; +static std::queue g_compilePipelineRequests; + +static void compileThreadFunc(sint32 threadIndex) +{ + SetThreadName("compilePl"); + + // one thread runs at normal priority while the others run at lower priority + if (threadIndex != 0) + ; // TODO: set thread priority + + while (true) + { + std::unique_lock lock(g_compilePipelineMutex); + while (g_compilePipelineRequests.empty()) + g_compilePipelineCondVar.wait(lock); + + MetalPipelineCompiler* request = g_compilePipelineRequests.front(); + + g_compilePipelineRequests.pop(); + + lock.unlock(); + + request->Compile(true, false, true); + delete request; + } +} + +static void initCompileThread() +{ + uint32 numCompileThreads; + + uint32 cpuCoreCount = GetPhysicalCoreCount(); + if (cpuCoreCount <= 2) + numCompileThreads = 1; + else + numCompileThreads = 2 
+ (cpuCoreCount - 3); // 2 plus one additionally for every extra core above 3 + + numCompileThreads = std::min(numCompileThreads, 8u); // cap at 8 + + for (uint32 i = 0; i < numCompileThreads; i++) + { + std::thread compileThread(compileThreadFunc, i); + compileThread.detach(); + } +} + +static void queuePipeline(MetalPipelineCompiler* v) +{ + std::unique_lock lock(g_compilePipelineMutex); + g_compilePipelineRequests.push(std::move(v)); + lock.unlock(); + g_compilePipelineCondVar.notify_one(); +} + +// make a guess if a pipeline is not essential +// non-essential means that skipping these drawcalls shouldn't lead to permanently corrupted graphics +bool IsAsyncPipelineAllowed(const MetalAttachmentsInfo& attachmentsInfo, Vector2i extend, uint32 indexCount) +{ + if (extend.x == 1600 && extend.y == 1600) + return false; // Splatoon ink mechanics use 1600x1600 R8 and R8G8 framebuffers, this resolution is rare enough that we can just blacklist it globally + + if (attachmentsInfo.depthFormat != Latte::E_GX2SURFFMT::INVALID_FORMAT) + return true; // aggressive filter but seems to work well so far + + // small index count (3,4,5,6) is often associated with full-viewport quads (which are considered essential due to often being used to generate persistent textures) + if (indexCount <= 6) + return false; + + return true; +} + +MetalPipelineCache* g_mtlPipelineCache = nullptr; + +MetalPipelineCache& MetalPipelineCache::GetInstance() +{ + return *g_mtlPipelineCache; +} + +MetalPipelineCache::MetalPipelineCache(class MetalRenderer* metalRenderer) : m_mtlr{metalRenderer} +{ + g_mtlPipelineCache = this; +} + +MetalPipelineCache::~MetalPipelineCache() +{ + for (auto& [key, pipelineObj] : m_pipelineCache) + { + pipelineObj->m_pipeline->release(); + delete pipelineObj; + } +} + +PipelineObject* MetalPipelineCache::GetRenderPipelineState(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const LatteDecompilerShader* geometryShader, const 
LatteDecompilerShader* pixelShader, const MetalAttachmentsInfo& lastUsedAttachmentsInfo, const MetalAttachmentsInfo& activeAttachmentsInfo, Vector2i extend, uint32 indexCount, const LatteContextRegister& lcr) +{ + uint64 hash = CalculatePipelineHash(fetchShader, vertexShader, geometryShader, pixelShader, lastUsedAttachmentsInfo, activeAttachmentsInfo, lcr); + PipelineObject*& pipelineObj = m_pipelineCache[hash]; + if (pipelineObj) + return pipelineObj; + + pipelineObj = new PipelineObject(); + + MetalPipelineCompiler* compiler = new MetalPipelineCompiler(m_mtlr, *pipelineObj); + compiler->InitFromState(fetchShader, vertexShader, geometryShader, pixelShader, lastUsedAttachmentsInfo, activeAttachmentsInfo, lcr); + + bool allowAsyncCompile = false; + if (GetConfig().async_compile) + allowAsyncCompile = IsAsyncPipelineAllowed(activeAttachmentsInfo, extend, indexCount); + + if (allowAsyncCompile) + { + if (!g_compilePipelineThreadInit) + { + initCompileThread(); + g_compilePipelineThreadInit = true; + } + + queuePipeline(compiler); + } + else + { + // Also force compile to ensure that the pipeline is ready + cemu_assert_debug(compiler->Compile(true, true, true)); + delete compiler; + } + + // Save to cache + AddCurrentStateToCache(hash, lastUsedAttachmentsInfo); + + return pipelineObj; +} + +uint64 MetalPipelineCache::CalculatePipelineHash(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const LatteDecompilerShader* geometryShader, const LatteDecompilerShader* pixelShader, const MetalAttachmentsInfo& lastUsedAttachmentsInfo, const MetalAttachmentsInfo& activeAttachmentsInfo, const LatteContextRegister& lcr) +{ + // Hash + uint64 stateHash = 0; + for (int i = 0; i < Latte::GPU_LIMITS::NUM_COLOR_ATTACHMENTS; ++i) + { + Latte::E_GX2SURFFMT format = lastUsedAttachmentsInfo.colorFormats[i]; + if (format == Latte::E_GX2SURFFMT::INVALID_FORMAT) + continue; + + stateHash += GetMtlPixelFormat(format, false) + i * 31; + stateHash = 
std::rotl(stateHash, 7); + + if (activeAttachmentsInfo.colorFormats[i] == Latte::E_GX2SURFFMT::INVALID_FORMAT) + { + stateHash += 1; + stateHash = std::rotl(stateHash, 1); + } + } + + if (lastUsedAttachmentsInfo.depthFormat != Latte::E_GX2SURFFMT::INVALID_FORMAT) + { + stateHash += GetMtlPixelFormat(lastUsedAttachmentsInfo.depthFormat, true); + stateHash = std::rotl(stateHash, 7); + + if (activeAttachmentsInfo.depthFormat == Latte::E_GX2SURFFMT::INVALID_FORMAT) + { + stateHash += 1; + stateHash = std::rotl(stateHash, 1); + } + } + + for (auto& group : fetchShader->bufferGroups) + { + uint32 bufferStride = group.getCurrentBufferStride(lcr.GetRawView()); + stateHash = std::rotl(stateHash, 7); + stateHash += bufferStride * 3; + } + + stateHash += fetchShader->getVkPipelineHashFragment(); + stateHash = std::rotl(stateHash, 7); + + stateHash += lcr.GetRawView()[mmVGT_STRMOUT_EN]; + stateHash = std::rotl(stateHash, 7); + + if(lcr.PA_CL_CLIP_CNTL.get_DX_RASTERIZATION_KILL()) + stateHash += 0x333333; + + stateHash = (stateHash >> 8) + (stateHash * 0x370531ull) % 0x7F980D3BF9B4639Dull; + + uint32* ctxRegister = lcr.GetRawView(); + + if (vertexShader) + stateHash += vertexShader->baseHash; + + stateHash = std::rotl(stateHash, 13); + + if (pixelShader) + stateHash += pixelShader->baseHash + pixelShader->auxHash; + + stateHash = std::rotl(stateHash, 13); + + uint32 polygonCtrl = lcr.PA_SU_SC_MODE_CNTL.getRawValue(); + stateHash += polygonCtrl; + stateHash = std::rotl(stateHash, 7); + + stateHash += ctxRegister[Latte::REGADDR::PA_CL_CLIP_CNTL]; + stateHash = std::rotl(stateHash, 7); + + const auto colorControlReg = ctxRegister[Latte::REGADDR::CB_COLOR_CONTROL]; + stateHash += colorControlReg; + + stateHash += ctxRegister[Latte::REGADDR::CB_TARGET_MASK]; + + const uint32 blendEnableMask = (colorControlReg >> 8) & 0xFF; + if (blendEnableMask) + { + for (auto i = 0; i < 8; ++i) + { + if (((blendEnableMask & (1 << i))) == 0) + continue; + stateHash = std::rotl(stateHash, 7); + 
stateHash += ctxRegister[Latte::REGADDR::CB_BLEND0_CONTROL + i]; + } + } + + // Mesh pipeline + const LattePrimitiveMode primitiveMode = static_cast(LatteGPUState.contextRegister[mmVGT_PRIMITIVE_TYPE]); + bool isPrimitiveRect = (primitiveMode == Latte::LATTE_VGT_PRIMITIVE_TYPE::E_PRIMITIVE_TYPE::RECTS); + + bool usesGeometryShader = (geometryShader != nullptr || isPrimitiveRect); + + if (usesGeometryShader) + { + stateHash += lcr.GetRawView()[mmVGT_PRIMITIVE_TYPE]; + stateHash = std::rotl(stateHash, 7); + } + + return stateHash; +} + +struct +{ + uint32 pipelineLoadIndex; + uint32 pipelineMaxFileIndex; + + std::atomic_uint32_t pipelinesQueued; + std::atomic_uint32_t pipelinesLoaded; +} g_mtlCacheState; + +uint32 MetalPipelineCache::BeginLoading(uint64 cacheTitleId) +{ + std::error_code ec; + fs::create_directories(ActiveSettings::GetCachePath("shaderCache/transferable"), ec); + const auto pathCacheFile = ActiveSettings::GetCachePath("shaderCache/transferable/{:016x}_mtlpipeline.bin", cacheTitleId); + + // init cache loader state + g_mtlCacheState.pipelineLoadIndex = 0; + g_mtlCacheState.pipelineMaxFileIndex = 0; + g_mtlCacheState.pipelinesLoaded = 0; + g_mtlCacheState.pipelinesQueued = 0; + + // start async compilation threads + m_compilationCount.store(0); + m_compilationQueue.clear(); + + // get core count + uint32 cpuCoreCount = GetPhysicalCoreCount(); + m_numCompilationThreads = std::clamp(cpuCoreCount, 1u, 8u); + // TODO: uncomment? 
+ //if (VulkanRenderer::GetInstance()->GetDisableMultithreadedCompilation()) + // m_numCompilationThreads = 1; + + for (uint32 i = 0; i < m_numCompilationThreads; i++) + { + std::thread compileThread(&MetalPipelineCache::CompilerThread, this); + compileThread.detach(); + } + + // open cache file or create it + cemu_assert_debug(s_cache == nullptr); + s_cache = FileCache::Open(pathCacheFile, true, LatteShaderCache_getPipelineCacheExtraVersion(cacheTitleId)); + if (!s_cache) + { + cemuLog_log(LogType::Force, "Failed to open or create Metal pipeline cache file: {}", _pathToUtf8(pathCacheFile)); + return 0; + } + else + { + s_cache->UseCompression(false); + g_mtlCacheState.pipelineMaxFileIndex = s_cache->GetMaximumFileIndex(); + } + return s_cache->GetFileCount(); +} + +bool MetalPipelineCache::UpdateLoading(uint32& pipelinesLoadedTotal, uint32& pipelinesMissingShaders) +{ + pipelinesLoadedTotal = g_mtlCacheState.pipelinesLoaded; + pipelinesMissingShaders = 0; + while (g_mtlCacheState.pipelineLoadIndex <= g_mtlCacheState.pipelineMaxFileIndex) + { + if (m_compilationQueue.size() >= 50) + { + std::this_thread::sleep_for(std::chrono::milliseconds(10)); + return true; // queue up to 50 entries at a time + } + + uint64 fileNameA, fileNameB; + std::vector fileData; + if (s_cache->GetFileByIndex(g_mtlCacheState.pipelineLoadIndex, &fileNameA, &fileNameB, fileData)) + { + // queue for async compilation + g_mtlCacheState.pipelinesQueued++; + m_compilationQueue.push(std::move(fileData)); + g_mtlCacheState.pipelineLoadIndex++; + return true; + } + g_mtlCacheState.pipelineLoadIndex++; + } + if (g_mtlCacheState.pipelinesLoaded != g_mtlCacheState.pipelinesQueued) + { + std::this_thread::sleep_for(std::chrono::milliseconds(10)); + return true; // pipelines still compiling + } + return false; // done +} + +void MetalPipelineCache::EndLoading() +{ + // shut down compilation threads + uint32 threadCount = m_numCompilationThreads; + m_numCompilationThreads = 0; // signal thread shutdown + 
for (uint32 i = 0; i < threadCount; i++) + { + m_compilationQueue.push({}); // push empty workload for every thread. Threads then will shutdown after checking for m_numCompilationThreads == 0 + } + // keep cache file open for writing of new pipelines +} + +void MetalPipelineCache::Close() +{ + if(s_cache) + { + delete s_cache; + s_cache = nullptr; + } +} + +struct CachedPipeline +{ + struct ShaderHash + { + uint64 baseHash; + uint64 auxHash; + bool isPresent{}; + + void set(uint64 baseHash, uint64 auxHash) + { + this->baseHash = baseHash; + this->auxHash = auxHash; + this->isPresent = true; + } + }; + + ShaderHash vsHash; // includes fetch shader + ShaderHash gsHash; + ShaderHash psHash; + + MetalAttachmentsInfo lastUsedAttachmentsInfo; + + Latte::GPUCompactedRegisterState gpuState; +}; + +void MetalPipelineCache::LoadPipelineFromCache(std::span fileData) +{ + static FSpinlock s_spinlockSharedInternal; + + // deserialize file + LatteContextRegister* lcr = new LatteContextRegister(); + s_spinlockSharedInternal.lock(); + CachedPipeline* cachedPipeline = new CachedPipeline(); + s_spinlockSharedInternal.unlock(); + + MemStreamReader streamReader(fileData.data(), fileData.size()); + if (!DeserializePipeline(streamReader, *cachedPipeline)) + { + // failed to deserialize + s_spinlockSharedInternal.lock(); + delete lcr; + delete cachedPipeline; + s_spinlockSharedInternal.unlock(); + return; + } + // restored register view from compacted state + Latte::LoadGPURegisterState(*lcr, cachedPipeline->gpuState); + + LatteDecompilerShader* vertexShader = nullptr; + LatteDecompilerShader* geometryShader = nullptr; + LatteDecompilerShader* pixelShader = nullptr; + // find vertex shader + if (cachedPipeline->vsHash.isPresent) + { + vertexShader = LatteSHRC_FindVertexShader(cachedPipeline->vsHash.baseHash, cachedPipeline->vsHash.auxHash); + if (!vertexShader) + { + cemuLog_log(LogType::Force, "Vertex shader not found in cache"); + return; + } + } + // find geometry shader + if 
(cachedPipeline->gsHash.isPresent) + { + geometryShader = LatteSHRC_FindGeometryShader(cachedPipeline->gsHash.baseHash, cachedPipeline->gsHash.auxHash); + if (!geometryShader) + { + cemuLog_log(LogType::Force, "Geometry shader not found in cache"); + return; + } + } + // find pixel shader + if (cachedPipeline->psHash.isPresent) + { + pixelShader = LatteSHRC_FindPixelShader(cachedPipeline->psHash.baseHash, cachedPipeline->psHash.auxHash); + if (!pixelShader) + { + cemuLog_log(LogType::Force, "Pixel shader not found in cache"); + return; + } + } + + if (!pixelShader) + { + cemu_assert_debug(false); + return; + } + + MetalAttachmentsInfo attachmentsInfo(*lcr, pixelShader); + + PipelineObject* pipelineObject = new PipelineObject(); + + // compile + { + MetalPipelineCompiler pp(m_mtlr, *pipelineObject); + pp.InitFromState(vertexShader->compatibleFetchShader, vertexShader, geometryShader, pixelShader, cachedPipeline->lastUsedAttachmentsInfo, attachmentsInfo, *lcr); + pp.Compile(true, true, false); + // destroy pp early + } + + // Cache the pipeline + uint64 pipelineStateHash = CalculatePipelineHash(vertexShader->compatibleFetchShader, vertexShader, geometryShader, pixelShader, cachedPipeline->lastUsedAttachmentsInfo, attachmentsInfo, *lcr); + m_pipelineCacheLock.lock(); + m_pipelineCache[pipelineStateHash] = pipelineObject; + m_pipelineCacheLock.unlock(); + + // clean up + s_spinlockSharedInternal.lock(); + delete lcr; + delete cachedPipeline; + s_spinlockSharedInternal.unlock(); +} + +ConcurrentQueue g_mtlPipelineCachingQueue; + +void MetalPipelineCache::AddCurrentStateToCache(uint64 pipelineStateHash, const MetalAttachmentsInfo& lastUsedAttachmentsInfo) +{ + if (!m_pipelineCacheStoreThread) + { + m_pipelineCacheStoreThread = new std::thread(&MetalPipelineCache::WorkerThread, this); + m_pipelineCacheStoreThread->detach(); + } + // fill job structure with cached GPU state + // for each cached pipeline we store: + // - Active shaders (referenced by hash) + // - An 
almost-complete register state of the GPU (minus some ALU uniform constants which aren't relevant) + CachedPipeline* job = new CachedPipeline(); + auto vs = LatteSHRC_GetActiveVertexShader(); + auto gs = LatteSHRC_GetActiveGeometryShader(); + auto ps = LatteSHRC_GetActivePixelShader(); + if (vs) + job->vsHash.set(vs->baseHash, vs->auxHash); + if (gs) + job->gsHash.set(gs->baseHash, gs->auxHash); + if (ps) + job->psHash.set(ps->baseHash, ps->auxHash); + job->lastUsedAttachmentsInfo = lastUsedAttachmentsInfo; + Latte::StoreGPURegisterState(LatteGPUState.contextNew, job->gpuState); + // queue job + g_mtlPipelineCachingQueue.push(job); +} + +bool MetalPipelineCache::SerializePipeline(MemStreamWriter& memWriter, CachedPipeline& cachedPipeline) +{ + memWriter.writeBE(0x01); // version + uint8 presentMask = 0; + if (cachedPipeline.vsHash.isPresent) + presentMask |= 1; + if (cachedPipeline.gsHash.isPresent) + presentMask |= 2; + if (cachedPipeline.psHash.isPresent) + presentMask |= 4; + memWriter.writeBE(presentMask); + if (cachedPipeline.vsHash.isPresent) + { + memWriter.writeBE(cachedPipeline.vsHash.baseHash); + memWriter.writeBE(cachedPipeline.vsHash.auxHash); + } + if (cachedPipeline.gsHash.isPresent) + { + memWriter.writeBE(cachedPipeline.gsHash.baseHash); + memWriter.writeBE(cachedPipeline.gsHash.auxHash); + } + if (cachedPipeline.psHash.isPresent) + { + memWriter.writeBE(cachedPipeline.psHash.baseHash); + memWriter.writeBE(cachedPipeline.psHash.auxHash); + } + + for (uint8 i = 0; i < LATTE_NUM_COLOR_TARGET; i++) + memWriter.writeBE((uint16)cachedPipeline.lastUsedAttachmentsInfo.colorFormats[i]); + memWriter.writeBE((uint16)cachedPipeline.lastUsedAttachmentsInfo.depthFormat); + + Latte::SerializeRegisterState(cachedPipeline.gpuState, memWriter); + + return true; +} + +bool MetalPipelineCache::DeserializePipeline(MemStreamReader& memReader, CachedPipeline& cachedPipeline) +{ + // version + if (memReader.readBE() != 1) + { + cemuLog_log(LogType::Force, "Cached Metal 
pipeline corrupted or has unknown version"); + return false; + } + // shader hashes + uint8 presentMask = memReader.readBE(); + if (presentMask & 1) + { + uint64 baseHash = memReader.readBE(); + uint64 auxHash = memReader.readBE(); + cachedPipeline.vsHash.set(baseHash, auxHash); + } + if (presentMask & 2) + { + uint64 baseHash = memReader.readBE(); + uint64 auxHash = memReader.readBE(); + cachedPipeline.gsHash.set(baseHash, auxHash); + } + if (presentMask & 4) + { + uint64 baseHash = memReader.readBE(); + uint64 auxHash = memReader.readBE(); + cachedPipeline.psHash.set(baseHash, auxHash); + } + + for (uint8 i = 0; i < LATTE_NUM_COLOR_TARGET; i++) + cachedPipeline.lastUsedAttachmentsInfo.colorFormats[i] = (Latte::E_GX2SURFFMT)memReader.readBE(); + cachedPipeline.lastUsedAttachmentsInfo.depthFormat = (Latte::E_GX2SURFFMT)memReader.readBE(); + + // deserialize GPU state + if (!Latte::DeserializeRegisterState(cachedPipeline.gpuState, memReader)) + { + return false; + } + cemu_assert_debug(!memReader.hasError()); + + return true; +} + +int MetalPipelineCache::CompilerThread() +{ + SetThreadName("plCacheCompiler"); + while (m_numCompilationThreads != 0) + { + std::vector pipelineData = m_compilationQueue.pop(); + if(pipelineData.empty()) + continue; + LoadPipelineFromCache(pipelineData); + ++g_mtlCacheState.pipelinesLoaded; + } + return 0; +} + +void MetalPipelineCache::WorkerThread() +{ + SetThreadName("plCacheWriter"); + while (true) + { + CachedPipeline* job; + g_mtlPipelineCachingQueue.pop(job); + if (!s_cache) + { + delete job; + continue; + } + // serialize + MemStreamWriter memWriter(1024 * 4); + SerializePipeline(memWriter, *job); + auto blob = memWriter.getResult(); + // file name is derived from data hash + uint8 hash[SHA256_DIGEST_LENGTH]; + SHA256(blob.data(), blob.size(), hash); + uint64 nameA = *(uint64be*)(hash + 0); + uint64 nameB = *(uint64be*)(hash + 8); + s_cache->AddFileAsync({ nameA, nameB }, blob.data(), blob.size()); + delete job; + } +} diff --git 
a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.h new file mode 100644 index 000000000..270c2db72 --- /dev/null +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.h @@ -0,0 +1,52 @@ +#pragma once + +#include "Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.h" +#include "util/helpers/ConcurrentQueue.h" +#include "util/helpers/fspinlock.h" +#include "util/math/vector2.h" + +class MetalPipelineCache +{ +public: + static MetalPipelineCache& GetInstance(); + + MetalPipelineCache(class MetalRenderer* metalRenderer); + ~MetalPipelineCache(); + + PipelineObject* GetRenderPipelineState(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const LatteDecompilerShader* geometryShader, const LatteDecompilerShader* pixelShader, const class MetalAttachmentsInfo& lastUsedAttachmentsInfo, const class MetalAttachmentsInfo& activeAttachmentsInfo, Vector2i extend, uint32 indexCount, const LatteContextRegister& lcr); + + // Cache loading + uint32 BeginLoading(uint64 cacheTitleId); // returns count of pipelines stored in cache + bool UpdateLoading(uint32& pipelinesLoadedTotal, uint32& pipelinesMissingShaders); + void EndLoading(); + void LoadPipelineFromCache(std::span fileData); + void Close(); // called on title exit + + // Debug + size_t GetPipelineCacheSize() const { return m_pipelineCache.size(); } + +private: + class MetalRenderer* m_mtlr; + + std::map m_pipelineCache; + FSpinlock m_pipelineCacheLock; + + std::thread* m_pipelineCacheStoreThread; + + class FileCache* s_cache; + + std::atomic_uint32_t m_numCompilationThreads{ 0 }; + ConcurrentQueue> m_compilationQueue; + std::atomic_uint32_t m_compilationCount; + + static uint64 CalculatePipelineHash(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const LatteDecompilerShader* geometryShader, const LatteDecompilerShader* pixelShader, const class MetalAttachmentsInfo& lastUsedAttachmentsInfo, const class 
MetalAttachmentsInfo& activeAttachmentsInfo, const LatteContextRegister& lcr); + + void AddCurrentStateToCache(uint64 pipelineStateHash, const class MetalAttachmentsInfo& lastUsedAttachmentsInfo); + + // pipeline serialization for file + bool SerializePipeline(class MemStreamWriter& memWriter, struct CachedPipeline& cachedPipeline); + bool DeserializePipeline(class MemStreamReader& memReader, struct CachedPipeline& cachedPipeline); + + int CompilerThread(); + void WorkerThread(); +}; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.cpp new file mode 100644 index 000000000..fb92727d9 --- /dev/null +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.cpp @@ -0,0 +1,495 @@ +#include "Cafe/HW/Latte/Renderer/Metal/MetalCommon.h" +#include "Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.h" +#include "Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h" +#include "Cafe/HW/Latte/Renderer/Metal/CachedFBOMtl.h" +#include "Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h" +#include "Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.h" +#include "Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.h" + +#include "Cafe/HW/Latte/Core/FetchShader.h" +#include "Cafe/HW/Latte/ISA/RegDefines.h" +#include "Cafe/HW/Latte/Core/LatteConst.h" +#include "Cafe/HW/Latte/Core/LatteShader.h" + +#include + +extern std::atomic_int g_compiling_pipelines; +extern std::atomic_int g_compiling_pipelines_async; +extern std::atomic_uint64_t g_compiling_pipelines_syncTimeSum; + +static void rectsEmulationGS_outputSingleVertex(std::string& gsSrc, const LatteDecompilerShader* vertexShader, LatteShaderPSInputTable& psInputTable, sint32 vIdx, const LatteContextRegister& latteRegister) +{ + auto parameterMask = vertexShader->outputParameterMask; + for (uint32 i = 0; i < 32; i++) + { + if ((parameterMask & (1 << i)) == 0) + continue; + sint32 vsSemanticId = psInputTable.getVertexShaderOutParamSemanticId(latteRegister.GetRawView(), i); + 
if (vsSemanticId < 0) + continue; + // make sure PS has matching input + if (!psInputTable.hasPSImportForSemanticId(vsSemanticId)) + continue; + gsSrc.append(fmt::format("out.passParameterSem{} = objectPayload.vertexOut[{}].passParameterSem{};\r\n", vsSemanticId, vIdx, vsSemanticId)); + } + gsSrc.append(fmt::format("out.position = objectPayload.vertexOut[{}].position;\r\n", vIdx)); + gsSrc.append(fmt::format("mesh.set_vertex({}, out);\r\n", vIdx)); +} + +static void rectsEmulationGS_outputGeneratedVertex(std::string& gsSrc, const LatteDecompilerShader* vertexShader, LatteShaderPSInputTable& psInputTable, const char* variant, const LatteContextRegister& latteRegister) +{ + auto parameterMask = vertexShader->outputParameterMask; + for (uint32 i = 0; i < 32; i++) + { + if ((parameterMask & (1 << i)) == 0) + continue; + sint32 vsSemanticId = psInputTable.getVertexShaderOutParamSemanticId(latteRegister.GetRawView(), i); + if (vsSemanticId < 0) + continue; + // make sure PS has matching input + if (!psInputTable.hasPSImportForSemanticId(vsSemanticId)) + continue; + gsSrc.append(fmt::format("out.passParameterSem{} = gen4thVertex{}(objectPayload.vertexOut[0].passParameterSem{}, objectPayload.vertexOut[1].passParameterSem{}, objectPayload.vertexOut[2].passParameterSem{});\r\n", vsSemanticId, variant, vsSemanticId, vsSemanticId, vsSemanticId)); + } + gsSrc.append(fmt::format("out.position = gen4thVertex{}(objectPayload.vertexOut[0].position, objectPayload.vertexOut[1].position, objectPayload.vertexOut[2].position);\r\n", variant)); + gsSrc.append(fmt::format("mesh.set_vertex(3, out);\r\n")); +} + +static void rectsEmulationGS_outputVerticesCode(std::string& gsSrc, const LatteDecompilerShader* vertexShader, LatteShaderPSInputTable& psInputTable, sint32 p0, sint32 p1, sint32 p2, sint32 p3, const char* variant, const LatteContextRegister& latteRegister) +{ + sint32 pList[4] = { p0, p1, p2, p3 }; + for (sint32 i = 0; i < 4; i++) + { + if (pList[i] == 3) + 
rectsEmulationGS_outputGeneratedVertex(gsSrc, vertexShader, psInputTable, variant, latteRegister); + else + rectsEmulationGS_outputSingleVertex(gsSrc, vertexShader, psInputTable, pList[i], latteRegister); + } + gsSrc.append(fmt::format("mesh.set_index(0, {});\r\n", pList[0])); + gsSrc.append(fmt::format("mesh.set_index(1, {});\r\n", pList[1])); + gsSrc.append(fmt::format("mesh.set_index(2, {});\r\n", pList[2])); + gsSrc.append(fmt::format("mesh.set_index(3, {});\r\n", pList[1])); + gsSrc.append(fmt::format("mesh.set_index(4, {});\r\n", pList[2])); + gsSrc.append(fmt::format("mesh.set_index(5, {});\r\n", pList[3])); +} + +static RendererShaderMtl* rectsEmulationGS_generate(MetalRenderer* metalRenderer, const LatteDecompilerShader* vertexShader, const LatteContextRegister& latteRegister) +{ + std::string gsSrc; + gsSrc.append("#include \r\n"); + gsSrc.append("using namespace metal;\r\n"); + + LatteShaderPSInputTable psInputTable; + LatteShader_CreatePSInputTable(&psInputTable, latteRegister.GetRawView()); + + // inputs & outputs + std::string vertexOutDefinition = "struct VertexOut {\r\n"; + vertexOutDefinition += "float4 position;\r\n"; + std::string geometryOutDefinition = "struct GeometryOut {\r\n"; + geometryOutDefinition += "float4 position [[position]];\r\n"; + auto parameterMask = vertexShader->outputParameterMask; + for (uint32 i = 0; i < 32; i++) + { + if ((parameterMask & (1 << i)) == 0) + continue; + sint32 vsSemanticId = psInputTable.getVertexShaderOutParamSemanticId(latteRegister.GetRawView(), i); + if (vsSemanticId < 0) + continue; + auto psImport = psInputTable.getPSImportBySemanticId(vsSemanticId); + if (psImport == nullptr) + continue; + + // VertexOut + vertexOutDefinition += fmt::format("float4 passParameterSem{};\r\n", vsSemanticId); + + // GeometryOut + geometryOutDefinition += fmt::format("float4 passParameterSem{}", vsSemanticId); + + geometryOutDefinition += fmt::format(" [[user(locn{})]]", 
psInputTable.getPSImportLocationBySemanticId(vsSemanticId)); + if (psImport->isFlat) + geometryOutDefinition += " [[flat]]"; + if (psImport->isNoPerspective) + geometryOutDefinition += " [[center_no_perspective]]"; + geometryOutDefinition += ";\r\n"; + } + vertexOutDefinition += "};\r\n"; + geometryOutDefinition += "};\r\n"; + + gsSrc.append(vertexOutDefinition); + gsSrc.append(geometryOutDefinition); + + gsSrc.append("struct ObjectPayload {\r\n"); + gsSrc.append("VertexOut vertexOut[3];\r\n"); + gsSrc.append("};\r\n"); + + // gen function + gsSrc.append("float4 gen4thVertexA(float4 a, float4 b, float4 c)\r\n"); + gsSrc.append("{\r\n"); + gsSrc.append("return b - (c - a);\r\n"); + gsSrc.append("}\r\n"); + + gsSrc.append("float4 gen4thVertexB(float4 a, float4 b, float4 c)\r\n"); + gsSrc.append("{\r\n"); + gsSrc.append("return c - (b - a);\r\n"); + gsSrc.append("}\r\n"); + + gsSrc.append("float4 gen4thVertexC(float4 a, float4 b, float4 c)\r\n"); + gsSrc.append("{\r\n"); + gsSrc.append("return c + (b - a);\r\n"); + gsSrc.append("}\r\n"); + + // main + gsSrc.append("using MeshType = mesh;\r\n"); + gsSrc.append("[[mesh, max_total_threads_per_threadgroup(1)]]\r\n"); + gsSrc.append("void main0(MeshType mesh, const object_data ObjectPayload& objectPayload [[payload]])\r\n"); + gsSrc.append("{\r\n"); + gsSrc.append("GeometryOut out;\r\n"); + + // there are two possible winding orders that need different triangle generation: + // 0 1 + // 2 3 + // and + // 0 1 + // 3 2 + // all others are just symmetries of these cases + + // we can determine the case by comparing the distance 0<->1 and 0<->2 + + gsSrc.append("float dist0_1 = length(objectPayload.vertexOut[1].position.xy - objectPayload.vertexOut[0].position.xy);\r\n"); + gsSrc.append("float dist0_2 = length(objectPayload.vertexOut[2].position.xy - objectPayload.vertexOut[0].position.xy);\r\n"); + gsSrc.append("float dist1_2 = length(objectPayload.vertexOut[2].position.xy - objectPayload.vertexOut[1].position.xy);\r\n"); + + 
// emit vertices + gsSrc.append("if(dist0_1 > dist0_2 && dist0_1 > dist1_2)\r\n"); + gsSrc.append("{\r\n"); + // p0 to p1 is diagonal + rectsEmulationGS_outputVerticesCode(gsSrc, vertexShader, psInputTable, 2, 1, 0, 3, "A", latteRegister); + gsSrc.append("} else if ( dist0_2 > dist0_1 && dist0_2 > dist1_2 ) {\r\n"); + // p0 to p2 is diagonal + rectsEmulationGS_outputVerticesCode(gsSrc, vertexShader, psInputTable, 1, 2, 0, 3, "B", latteRegister); + gsSrc.append("} else {\r\n"); + // p1 to p2 is diagonal + rectsEmulationGS_outputVerticesCode(gsSrc, vertexShader, psInputTable, 0, 1, 2, 3, "C", latteRegister); + gsSrc.append("}\r\n"); + + gsSrc.append("mesh.set_primitive_count(2);\r\n"); + + gsSrc.append("}\r\n"); + + auto mtlShader = new RendererShaderMtl(metalRenderer, RendererShader::ShaderType::kGeometry, 0, 0, false, false, gsSrc); + mtlShader->PreponeCompilation(true); + + return mtlShader; +} + +#define INVALID_TITLE_ID 0xFFFFFFFFFFFFFFFF + +uint64 s_cacheTitleId = INVALID_TITLE_ID; + +extern std::atomic_int g_compiled_shaders_total; +extern std::atomic_int g_compiled_shaders_async; + +template +void SetFragmentState(T* desc, const MetalAttachmentsInfo& lastUsedAttachmentsInfo, const MetalAttachmentsInfo& activeAttachmentsInfo, bool rasterizationEnabled, const LatteContextRegister& lcr) +{ + // TODO: check if the pixel shader is valid as well? 
+ if (!rasterizationEnabled/* || !pixelShaderMtl*/) + { + desc->setRasterizationEnabled(false); + return; + } + + // Color attachments + const Latte::LATTE_CB_COLOR_CONTROL& colorControlReg = lcr.CB_COLOR_CONTROL; + uint32 blendEnableMask = colorControlReg.get_BLEND_MASK(); + uint32 renderTargetMask = lcr.CB_TARGET_MASK.get_MASK(); + for (uint8 i = 0; i < LATTE_NUM_COLOR_TARGET; i++) + { + Latte::E_GX2SURFFMT format = lastUsedAttachmentsInfo.colorFormats[i]; + if (format == Latte::E_GX2SURFFMT::INVALID_FORMAT) + continue; + + MTL::PixelFormat pixelFormat = GetMtlPixelFormat(format, false); + auto colorAttachment = desc->colorAttachments()->object(i); + colorAttachment->setPixelFormat(pixelFormat); + + // Disable writes if not in the active FBO + if (activeAttachmentsInfo.colorFormats[i] == Latte::E_GX2SURFFMT::INVALID_FORMAT) + { + colorAttachment->setWriteMask(MTL::ColorWriteMaskNone); + continue; + } + + colorAttachment->setWriteMask(GetMtlColorWriteMask((renderTargetMask >> (i * 4)) & 0xF)); + + // Blending + bool blendEnabled = ((blendEnableMask & (1 << i))) != 0; + // Only float data type is blendable + if (blendEnabled && GetMtlPixelFormatInfo(format, false).dataType == MetalDataType::FLOAT) + { + colorAttachment->setBlendingEnabled(true); + + const auto& blendControlReg = lcr.CB_BLENDN_CONTROL[i]; + + auto rgbBlendOp = GetMtlBlendOp(blendControlReg.get_COLOR_COMB_FCN()); + auto srcRgbBlendFactor = GetMtlBlendFactor(blendControlReg.get_COLOR_SRCBLEND()); + auto dstRgbBlendFactor = GetMtlBlendFactor(blendControlReg.get_COLOR_DSTBLEND()); + + colorAttachment->setRgbBlendOperation(rgbBlendOp); + colorAttachment->setSourceRGBBlendFactor(srcRgbBlendFactor); + colorAttachment->setDestinationRGBBlendFactor(dstRgbBlendFactor); + if (blendControlReg.get_SEPARATE_ALPHA_BLEND()) + { + colorAttachment->setAlphaBlendOperation(GetMtlBlendOp(blendControlReg.get_ALPHA_COMB_FCN())); + 
colorAttachment->setSourceAlphaBlendFactor(GetMtlBlendFactor(blendControlReg.get_ALPHA_SRCBLEND())); + colorAttachment->setDestinationAlphaBlendFactor(GetMtlBlendFactor(blendControlReg.get_ALPHA_DSTBLEND())); + } + else + { + colorAttachment->setAlphaBlendOperation(rgbBlendOp); + colorAttachment->setSourceAlphaBlendFactor(srcRgbBlendFactor); + colorAttachment->setDestinationAlphaBlendFactor(dstRgbBlendFactor); + } + } + } + + // Depth stencil attachment + if (lastUsedAttachmentsInfo.depthFormat != Latte::E_GX2SURFFMT::INVALID_FORMAT) + { + MTL::PixelFormat pixelFormat = GetMtlPixelFormat(lastUsedAttachmentsInfo.depthFormat, true); + desc->setDepthAttachmentPixelFormat(pixelFormat); + if (lastUsedAttachmentsInfo.hasStencil) + desc->setStencilAttachmentPixelFormat(pixelFormat); + } +} + +MetalPipelineCompiler::~MetalPipelineCompiler() +{ + /* + for (auto& pair : m_pipelineCache) + { + pair.second->release(); + } + m_pipelineCache.clear(); + + NS::Error* error = nullptr; + m_binaryArchive->serializeToURL(m_binaryArchiveURL, &error); + if (error) + { + cemuLog_log(LogType::Force, "error serializing binary archive: {}", error->localizedDescription()->utf8String()); + error->release(); + } + m_binaryArchive->release(); + + m_binaryArchiveURL->release(); + */ + m_pipelineDescriptor->release(); +} + +void MetalPipelineCompiler::InitFromState(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const LatteDecompilerShader* geometryShader, const LatteDecompilerShader* pixelShader, const MetalAttachmentsInfo& lastUsedAttachmentsInfo, const MetalAttachmentsInfo& activeAttachmentsInfo, const LatteContextRegister& lcr) +{ + // Check if the pipeline uses a geometry shader + const LattePrimitiveMode primitiveMode = static_cast(lcr.VGT_PRIMITIVE_TYPE.get_PRIMITIVE_MODE()); + bool isPrimitiveRect = (primitiveMode == Latte::LATTE_VGT_PRIMITIVE_TYPE::E_PRIMITIVE_TYPE::RECTS); + + m_usesGeometryShader = (geometryShader != nullptr || isPrimitiveRect); + + // 
Rasterization + m_rasterizationEnabled = !lcr.PA_CL_CLIP_CNTL.get_DX_RASTERIZATION_KILL(); + + // HACK + // TODO: include this in the hash? + if (!lcr.PA_CL_VTE_CNTL.get_VPORT_X_OFFSET_ENA()) + m_rasterizationEnabled = true; + + // Culling both front and back faces effectively disables rasterization + const auto& polygonControlReg = lcr.PA_SU_SC_MODE_CNTL; + uint32 cullFront = polygonControlReg.get_CULL_FRONT(); + uint32 cullBack = polygonControlReg.get_CULL_BACK(); + if (cullFront && cullBack) + m_rasterizationEnabled = false; + + // Shaders + m_vertexShaderMtl = static_cast(vertexShader->shader); + if (geometryShader) + m_geometryShaderMtl = static_cast(geometryShader->shader); + else if (isPrimitiveRect) + m_geometryShaderMtl = rectsEmulationGS_generate(m_mtlr, vertexShader, lcr); + else + m_geometryShaderMtl = nullptr; + m_pixelShaderMtl = static_cast(pixelShader->shader); + + if (m_usesGeometryShader) + InitFromStateMesh(fetchShader, lastUsedAttachmentsInfo, activeAttachmentsInfo, lcr); + else + InitFromStateRender(fetchShader, vertexShader, lastUsedAttachmentsInfo, activeAttachmentsInfo, lcr); +} + +bool MetalPipelineCompiler::Compile(bool forceCompile, bool isRenderThread, bool showInOverlay) +{ + if (forceCompile) + { + // if some shader stages are not compiled yet, compile them now + if (m_vertexShaderMtl && !m_vertexShaderMtl->IsCompiled()) + m_vertexShaderMtl->PreponeCompilation(isRenderThread); + if (m_geometryShaderMtl && !m_geometryShaderMtl->IsCompiled()) + m_geometryShaderMtl->PreponeCompilation(isRenderThread); + if (m_pixelShaderMtl && !m_pixelShaderMtl->IsCompiled()) + m_pixelShaderMtl->PreponeCompilation(isRenderThread); + } + else + { + // fail early if some shader stages are not compiled + if (m_vertexShaderMtl && !m_vertexShaderMtl->IsCompiled()) + return false; + if (m_geometryShaderMtl && !m_geometryShaderMtl->IsCompiled()) + return false; + if (m_pixelShaderMtl && !m_pixelShaderMtl->IsCompiled()) + return false; + } + + // Compile + 
MTL::RenderPipelineState* pipeline = nullptr; + NS::Error* error = nullptr; + + auto start = std::chrono::high_resolution_clock::now(); + if (m_usesGeometryShader) + { + auto desc = static_cast(m_pipelineDescriptor); + + // Shaders + desc->setObjectFunction(m_vertexShaderMtl->GetFunction()); + desc->setMeshFunction(m_geometryShaderMtl->GetFunction()); + if (m_rasterizationEnabled) + desc->setFragmentFunction(m_pixelShaderMtl->GetFunction()); + +#ifdef CEMU_DEBUG_ASSERT + desc->setLabel(GetLabel("Mesh render pipeline state", desc)); +#endif + pipeline = m_mtlr->GetDevice()->newRenderPipelineState(desc, MTL::PipelineOptionNone, nullptr, &error); + } + else + { + auto desc = static_cast(m_pipelineDescriptor); + + // Shaders + desc->setVertexFunction(m_vertexShaderMtl->GetFunction()); + if (m_rasterizationEnabled) + desc->setFragmentFunction(m_pixelShaderMtl->GetFunction()); + +#ifdef CEMU_DEBUG_ASSERT + desc->setLabel(GetLabel("Render pipeline state", desc)); +#endif + pipeline = m_mtlr->GetDevice()->newRenderPipelineState(desc, MTL::PipelineOptionNone, nullptr, &error); + } + auto end = std::chrono::high_resolution_clock::now(); + + auto creationDuration = std::chrono::duration_cast(end - start).count(); + + if (error) + { + cemuLog_log(LogType::Force, "error creating render pipeline state: {}", error->localizedDescription()->utf8String()); + } + + if (showInOverlay) + { + if (isRenderThread) + g_compiling_pipelines_syncTimeSum += creationDuration; + else + g_compiling_pipelines_async++; + g_compiling_pipelines++; + } + + m_pipelineObj.m_pipeline = pipeline; + + return true; +} + +void MetalPipelineCompiler::InitFromStateRender(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const MetalAttachmentsInfo& lastUsedAttachmentsInfo, const MetalAttachmentsInfo& activeAttachmentsInfo, const LatteContextRegister& lcr) +{ + // Render pipeline state + MTL::RenderPipelineDescriptor* desc = MTL::RenderPipelineDescriptor::alloc()->init(); + + // 
Vertex descriptor + if (!fetchShader->mtlFetchVertexManually) + { + MTL::VertexDescriptor* vertexDescriptor = MTL::VertexDescriptor::alloc()->init(); + for (auto& bufferGroup : fetchShader->bufferGroups) + { + std::optional fetchType; + + uint32 minBufferStride = 0; + for (sint32 j = 0; j < bufferGroup.attribCount; ++j) + { + auto& attr = bufferGroup.attrib[j]; + + uint32 semanticId = vertexShader->resourceMapping.attributeMapping[attr.semanticId]; + if (semanticId == (uint32)-1) + continue; // attribute not used? + + auto attribute = vertexDescriptor->attributes()->object(semanticId); + attribute->setOffset(attr.offset); + attribute->setBufferIndex(GET_MTL_VERTEX_BUFFER_INDEX(attr.attributeBufferIndex)); + attribute->setFormat(GetMtlVertexFormat(attr.format)); + + minBufferStride = std::max(minBufferStride, attr.offset + GetMtlVertexFormatSize(attr.format)); + + if (fetchType.has_value()) + cemu_assert_debug(fetchType == attr.fetchType); + else + fetchType = attr.fetchType; + + if (attr.fetchType == LatteConst::INSTANCE_DATA) + { + cemu_assert_debug(attr.aluDivisor == 1); // other divisor not yet supported + } + } + + uint32 bufferIndex = bufferGroup.attributeBufferIndex; + uint32 bufferBaseRegisterIndex = mmSQ_VTX_ATTRIBUTE_BLOCK_START + bufferIndex * 7; + uint32 bufferStride = (lcr.GetRawView()[bufferBaseRegisterIndex + 2] >> 11) & 0xFFFF; + + auto layout = vertexDescriptor->layouts()->object(GET_MTL_VERTEX_BUFFER_INDEX(bufferIndex)); + if (bufferStride == 0) + { + // Buffer stride cannot be zero, let's use the minimum stride + bufferStride = minBufferStride; + + // Additionally, constant vertex function must be used + layout->setStepFunction(MTL::VertexStepFunctionConstant); + layout->setStepRate(0); + } + else + { + if (!fetchType.has_value() || fetchType == LatteConst::VertexFetchType2::VERTEX_DATA) + layout->setStepFunction(MTL::VertexStepFunctionPerVertex); + else if (fetchType == LatteConst::VertexFetchType2::INSTANCE_DATA) + 
layout->setStepFunction(MTL::VertexStepFunctionPerInstance); + else + { + cemuLog_log(LogType::Force, "unimplemented vertex fetch type {}", (uint32)fetchType.value()); + cemu_assert(false); + } + } + bufferStride = Align(bufferStride, 4); + layout->setStride(bufferStride); + } + + desc->setVertexDescriptor(vertexDescriptor); + vertexDescriptor->release(); + } + + SetFragmentState(desc, lastUsedAttachmentsInfo, activeAttachmentsInfo, m_rasterizationEnabled, lcr); + + m_pipelineDescriptor = desc; +} + +void MetalPipelineCompiler::InitFromStateMesh(const LatteFetchShader* fetchShader, const MetalAttachmentsInfo& lastUsedAttachmentsInfo, const MetalAttachmentsInfo& activeAttachmentsInfo, const LatteContextRegister& lcr) +{ + // Render pipeline state + MTL::MeshRenderPipelineDescriptor* desc = MTL::MeshRenderPipelineDescriptor::alloc()->init(); + + SetFragmentState(desc, lastUsedAttachmentsInfo, activeAttachmentsInfo, m_rasterizationEnabled, lcr); + + m_pipelineDescriptor = desc; +} diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.h new file mode 100644 index 000000000..5006ed595 --- /dev/null +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.h @@ -0,0 +1,38 @@ +#pragma once + +#include "Cafe/HW/Latte/Renderer/Metal/MetalAttachmentsInfo.h" + +#include "Cafe/HW/Latte/ISA/LatteReg.h" +#include "Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompiler.h" + +struct PipelineObject +{ + MTL::RenderPipelineState* m_pipeline = nullptr; +}; + +class MetalPipelineCompiler +{ +public: + MetalPipelineCompiler(class MetalRenderer* metalRenderer, PipelineObject& pipelineObj) : m_mtlr{metalRenderer}, m_pipelineObj{pipelineObj} {} + ~MetalPipelineCompiler(); + + void InitFromState(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const LatteDecompilerShader* geometryShader, const LatteDecompilerShader* pixelShader, const class MetalAttachmentsInfo& lastUsedAttachmentsInfo, 
const class MetalAttachmentsInfo& activeAttachmentsInfo, const LatteContextRegister& lcr); + + bool Compile(bool forceCompile, bool isRenderThread, bool showInOverlay); + +private: + class MetalRenderer* m_mtlr; + PipelineObject& m_pipelineObj; + + class RendererShaderMtl* m_vertexShaderMtl; + class RendererShaderMtl* m_geometryShaderMtl; + class RendererShaderMtl* m_pixelShaderMtl; + bool m_usesGeometryShader; + bool m_rasterizationEnabled; + + NS::Object* m_pipelineDescriptor; + + void InitFromStateRender(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const class MetalAttachmentsInfo& lastUsedAttachmentsInfo, const class MetalAttachmentsInfo& activeAttachmentsInfo, const LatteContextRegister& lcr); + + void InitFromStateMesh(const LatteFetchShader* fetchShader, const class MetalAttachmentsInfo& lastUsedAttachmentsInfo, const class MetalAttachmentsInfo& activeAttachmentsInfo, const LatteContextRegister& lcr); +}; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalQuery.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalQuery.cpp new file mode 100644 index 000000000..ee79f2dd8 --- /dev/null +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalQuery.cpp @@ -0,0 +1,38 @@ +#include "Cafe/HW/Latte/Renderer/Metal/MetalQuery.h" +#include "Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h" + +bool LatteQueryObjectMtl::getResult(uint64& numSamplesPassed) +{ + if (m_commandBuffer && !CommandBufferCompleted(m_commandBuffer)) + return false; + + uint64* resultPtr = m_mtlr->GetOcclusionQueryResultsPtr(); + + numSamplesPassed = 0; + for (uint32 i = m_range.begin; i != m_range.end; i = (i + 1) % MetalRenderer::OCCLUSION_QUERY_POOL_SIZE) + numSamplesPassed += resultPtr[i]; + + return true; +} + +LatteQueryObjectMtl::~LatteQueryObjectMtl() +{ + if (m_commandBuffer) + m_commandBuffer->release(); +} + +void LatteQueryObjectMtl::begin() +{ + m_range.begin = m_mtlr->GetOcclusionQueryIndex(); + m_mtlr->BeginOcclusionQuery(); +} + +void LatteQueryObjectMtl::end() +{ + 
m_range.end = m_mtlr->GetOcclusionQueryIndex(); + m_mtlr->EndOcclusionQuery(); + + m_commandBuffer = m_mtlr->GetAndRetainCurrentCommandBufferIfNotCompleted(); + if (m_commandBuffer) + m_mtlr->RequestSoonCommit(); +} diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalQuery.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalQuery.h new file mode 100644 index 000000000..3de0939a0 --- /dev/null +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalQuery.h @@ -0,0 +1,28 @@ +#pragma once + +#include "Cafe/HW/Latte/Core/LatteQueryObject.h" + +#include "Cafe/HW/Latte/Renderer/Metal/MetalCommon.h" + +class LatteQueryObjectMtl : public LatteQueryObject +{ +public: + LatteQueryObjectMtl(class MetalRenderer* mtlRenderer) : m_mtlr{mtlRenderer} {} + ~LatteQueryObjectMtl(); + + bool getResult(uint64& numSamplesPassed) override; + void begin() override; + void end() override; + + void GrowRange() + { + m_range.end++; + } + +private: + class MetalRenderer* m_mtlr; + + MetalQueryRange m_range = {INVALID_UINT32, INVALID_UINT32}; + // TODO: make this a list of command buffers? 
+ MTL::CommandBuffer* m_commandBuffer = nullptr; +}; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp new file mode 100644 index 000000000..61e5c94a7 --- /dev/null +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -0,0 +1,2253 @@ +#include "Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h" +#include "Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h" +#include "Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.h" +#include "Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.h" +#include "Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.h" +#include "Cafe/HW/Latte/Renderer/Metal/CachedFBOMtl.h" +#include "Cafe/HW/Latte/Renderer/Metal/MetalOutputShaderCache.h" +#include "Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.h" +#include "Cafe/HW/Latte/Renderer/Metal/MetalDepthStencilCache.h" +#include "Cafe/HW/Latte/Renderer/Metal/MetalSamplerCache.h" +#include "Cafe/HW/Latte/Renderer/Metal/LatteTextureReadbackMtl.h" +#include "Cafe/HW/Latte/Renderer/Metal/MetalVoidVertexPipeline.h" +#include "Cafe/HW/Latte/Renderer/Metal/MetalQuery.h" +#include "Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h" +#include "Cafe/HW/Latte/Renderer/Metal/UtilityShaderSource.h" + +#include "Cafe/HW/Latte/Core/LatteShader.h" +#include "Cafe/HW/Latte/Core/LatteIndices.h" +#include "Cafe/HW/Latte/Core/LatteBufferCache.h" +#include "Cemu/Logging/CemuLogging.h" +#include "Cafe/HW/Latte/Core/FetchShader.h" +#include "Cafe/HW/Latte/Core/LatteConst.h" +#include "HW/Latte/Renderer/Metal/MetalBufferAllocator.h" +#include "HW/Latte/Renderer/Metal/MetalCommon.h" +#include "config/CemuConfig.h" +#include "gui/guiWrapper.h" + +#define IMGUI_IMPL_METAL_CPP +#include "imgui/imgui_extension.h" +#include "imgui/imgui_impl_metal.h" + +#define EVENT_VALUE_WRAP 4096 + +extern bool hasValidFramebufferAttached; + +float supportBufferData[512 * 4]; + +// Defined in the OpenGL renderer +void LatteDraw_handleSpecialState8_clearAsDepth(); + +std::vector 
MetalRenderer::GetDevices() +{ + auto devices = MTL::CopyAllDevices(); + std::vector result; + result.reserve(devices->count()); + for (uint32 i = 0; i < devices->count(); i++) + { + MTL::Device* device = static_cast(devices->object(i)); + result.emplace_back(std::string(device->name()->utf8String()), device->registryID()); + } + + return result; +} + +MetalRenderer::MetalRenderer() +{ + // Pick a device + auto& config = GetConfig(); + const bool hasDeviceSet = config.mtl_graphic_device_uuid != 0; + + // If a device is set, try to find it + if (hasDeviceSet) + { + auto devices = MTL::CopyAllDevices(); + for (uint32 i = 0; i < devices->count(); i++) + { + MTL::Device* device = static_cast(devices->object(i)); + if (device->registryID() == config.mtl_graphic_device_uuid) + { + m_device = device; + break; + } + } + } + + if (!m_device) + { + if (hasDeviceSet) + { + cemuLog_log(LogType::Force, "The selected GPU ({}) could not be found. Using the system default device.", config.mtl_graphic_device_uuid); + config.mtl_graphic_device_uuid = 0; + } + // Use the system default device + m_device = MTL::CreateSystemDefaultDevice(); + } + + // Feature support + m_isAppleGPU = m_device->supportsFamily(MTL::GPUFamilyApple1); + m_supportsFramebufferFetch = GetConfig().framebuffer_fetch.GetValue() ? 
m_device->supportsFamily(MTL::GPUFamilyApple2) : false; + m_hasUnifiedMemory = m_device->hasUnifiedMemory(); + m_supportsMetal3 = m_device->supportsFamily(MTL::GPUFamilyMetal3); + m_recommendedMaxVRAMUsage = m_device->recommendedMaxWorkingSetSize(); + m_pixelFormatSupport = MetalPixelFormatSupport(m_device); + + CheckForPixelFormatSupport(m_pixelFormatSupport); + + // Command queue + m_commandQueue = m_device->newCommandQueue(); + + // Synchronization resources + m_event = m_device->newEvent(); + + // Resources + MTL::SamplerDescriptor* samplerDescriptor = MTL::SamplerDescriptor::alloc()->init(); +#ifdef CEMU_DEBUG_ASSERT + samplerDescriptor->setLabel(GetLabel("Nearest sampler state", samplerDescriptor)); +#endif + m_nearestSampler = m_device->newSamplerState(samplerDescriptor); + + samplerDescriptor->setMinFilter(MTL::SamplerMinMagFilterLinear); + samplerDescriptor->setMagFilter(MTL::SamplerMinMagFilterLinear); +#ifdef CEMU_DEBUG_ASSERT + samplerDescriptor->setLabel(GetLabel("Linear sampler state", samplerDescriptor)); +#endif + m_linearSampler = m_device->newSamplerState(samplerDescriptor); + samplerDescriptor->release(); + + // Null resources + MTL::TextureDescriptor* textureDescriptor = MTL::TextureDescriptor::alloc()->init(); + textureDescriptor->setTextureType(MTL::TextureType1D); + textureDescriptor->setWidth(1); + textureDescriptor->setUsage(MTL::TextureUsageShaderRead); + m_nullTexture1D = m_device->newTexture(textureDescriptor); +#ifdef CEMU_DEBUG_ASSERT + m_nullTexture1D->setLabel(GetLabel("Null texture 1D", m_nullTexture1D)); +#endif + + textureDescriptor->setTextureType(MTL::TextureType2D); + textureDescriptor->setHeight(1); + textureDescriptor->setUsage(MTL::TextureUsageShaderRead | MTL::TextureUsageRenderTarget); + m_nullTexture2D = m_device->newTexture(textureDescriptor); +#ifdef CEMU_DEBUG_ASSERT + m_nullTexture2D->setLabel(GetLabel("Null texture 2D", m_nullTexture2D)); +#endif + textureDescriptor->release(); + + m_memoryManager = new 
MetalMemoryManager(this); + m_outputShaderCache = new MetalOutputShaderCache(this); + m_pipelineCache = new MetalPipelineCache(this); + m_depthStencilCache = new MetalDepthStencilCache(this); + m_samplerCache = new MetalSamplerCache(this); + + // Lower the commit treshold when buffer cache needs reduced latency + if (m_memoryManager->NeedsReducedLatency()) + m_defaultCommitTreshlod = 64; + else + m_defaultCommitTreshlod = 196; + + // Occlusion queries + m_occlusionQuery.m_resultBuffer = m_device->newBuffer(OCCLUSION_QUERY_POOL_SIZE * sizeof(uint64), MTL::ResourceStorageModeShared); +#ifdef CEMU_DEBUG_ASSERT + m_occlusionQuery.m_resultBuffer->setLabel(GetLabel("Occlusion query result buffer", m_occlusionQuery.m_resultBuffer)); +#endif + m_occlusionQuery.m_resultsPtr = (uint64*)m_occlusionQuery.m_resultBuffer->contents(); + + // Reset vertex and uniform buffers + for (uint32 i = 0; i < MAX_MTL_VERTEX_BUFFERS; i++) + m_state.m_vertexBufferOffsets[i] = INVALID_OFFSET; + + for (uint32 i = 0; i < METAL_SHADER_TYPE_TOTAL; i++) + { + for (uint32 j = 0; j < MAX_MTL_BUFFERS; j++) + m_state.m_uniformBufferOffsets[i][j] = INVALID_OFFSET; + } + + // Utility shader library + + // Create the library + NS::Error* error = nullptr; + MTL::Library* utilityLibrary = m_device->newLibrary(ToNSString(utilityShaderSource), nullptr, &error); + if (error) + { + cemuLog_log(LogType::Force, "failed to create utility library (error: {})", error->localizedDescription()->utf8String()); + } + + // Pipelines + MTL::Function* vertexFullscreenFunction = utilityLibrary->newFunction(ToNSString("vertexFullscreen")); + MTL::Function* fragmentCopyDepthToColorFunction = utilityLibrary->newFunction(ToNSString("fragmentCopyDepthToColor")); + + m_copyDepthToColorDesc = MTL::RenderPipelineDescriptor::alloc()->init(); + m_copyDepthToColorDesc->setVertexFunction(vertexFullscreenFunction); + m_copyDepthToColorDesc->setFragmentFunction(fragmentCopyDepthToColorFunction); + vertexFullscreenFunction->release(); + 
fragmentCopyDepthToColorFunction->release(); + + // Void vertex pipelines + if (m_isAppleGPU) + m_copyBufferToBufferPipeline = new MetalVoidVertexPipeline(this, utilityLibrary, "vertexCopyBufferToBuffer"); + + utilityLibrary->release(); + + // HACK: for some reason, this variable ends up being initialized to some garbage data, even though its declared as bool m_captureFrame = false; + m_occlusionQuery.m_lastCommandBuffer = nullptr; + m_captureFrame = false; +} + +MetalRenderer::~MetalRenderer() +{ + if (m_isAppleGPU) + delete m_copyBufferToBufferPipeline; + //delete m_copyTextureToTexturePipeline; + //delete m_restrideBufferPipeline; + + m_copyDepthToColorDesc->release(); + for (const auto [pixelFormat, pipeline] : m_copyDepthToColorPipelines) + pipeline->release(); + + delete m_outputShaderCache; + delete m_pipelineCache; + delete m_depthStencilCache; + delete m_samplerCache; + delete m_memoryManager; + + m_nullTexture1D->release(); + m_nullTexture2D->release(); + + m_nearestSampler->release(); + m_linearSampler->release(); + + if (m_readbackBuffer) + m_readbackBuffer->release(); + + if (m_xfbRingBuffer) + m_xfbRingBuffer->release(); + + m_occlusionQuery.m_resultBuffer->release(); + + m_event->release(); + + m_commandQueue->release(); + m_device->release(); +} + +void MetalRenderer::InitializeLayer(const Vector2i& size, bool mainWindow) +{ + auto& layer = GetLayer(mainWindow); + layer = MetalLayerHandle(m_device, size, mainWindow); + layer.GetLayer()->setPixelFormat(MTL::PixelFormatBGRA8Unorm); +} + +void MetalRenderer::ShutdownLayer(bool mainWindow) +{ + GetLayer(mainWindow) = MetalLayerHandle(); +} + +void MetalRenderer::ResizeLayer(const Vector2i& size, bool mainWindow) +{ + GetLayer(mainWindow).Resize(size); +} + +void MetalRenderer::Initialize() +{ + Renderer::Initialize(); + RendererShaderMtl::Initialize(); +} + +void MetalRenderer::Shutdown() +{ + // TODO: should shutdown both layers + ImGui_ImplMetal_Shutdown(); + CommitCommandBuffer(); + 
Renderer::Shutdown(); + RendererShaderMtl::Shutdown(); +} + +bool MetalRenderer::IsPadWindowActive() +{ + return (GetLayer(false).GetLayer() != nullptr); +} + +bool MetalRenderer::GetVRAMInfo(int& usageInMB, int& totalInMB) const +{ + // Subtract host memory from total VRAM, since it's shared with the CPU + usageInMB = (m_device->currentAllocatedSize() - m_memoryManager->GetHostAllocationSize()) / 1024 / 1024; + totalInMB = m_recommendedMaxVRAMUsage / 1024 / 1024; + + return true; +} + +void MetalRenderer::ClearColorbuffer(bool padView) +{ + if (!AcquireDrawable(!padView)) + return; + + ClearColorTextureInternal(GetLayer(!padView).GetDrawable()->texture(), 0, 0, 0.0f, 0.0f, 0.0f, 0.0f); +} + +void MetalRenderer::DrawEmptyFrame(bool mainWindow) +{ + if (!BeginFrame(mainWindow)) + return; + SwapBuffers(mainWindow, !mainWindow); +} + +void MetalRenderer::SwapBuffers(bool swapTV, bool swapDRC) +{ + if (swapTV) + SwapBuffer(true); + if (swapDRC) + SwapBuffer(false); + + // Reset the command buffers (they are released by TemporaryBufferAllocator) + CommitCommandBuffer(); + + // Debug + m_performanceMonitor.ResetPerFrameData(); + + // GPU capture + if (m_capturing) + { + EndCapture(); + } + else if (m_captureFrame) + { + StartCapture(); + m_captureFrame = false; + } +} + +void MetalRenderer::HandleScreenshotRequest(LatteTextureView* texView, bool padView) { + const bool hasScreenshotRequest = gui_hasScreenshotRequest(); + if (!hasScreenshotRequest && m_screenshot_state == ScreenshotState::None) + return; + + if (m_mainLayer.GetDrawable()) + { + // we already took a pad view screenshot and want a main window screenshot + if (m_screenshot_state == ScreenshotState::Main && padView) + return; + + if (m_screenshot_state == ScreenshotState::Pad && !padView) + return; + + // remember which screenshot is left to take + if (m_screenshot_state == ScreenshotState::None) + m_screenshot_state = padView ?
ScreenshotState::Main : ScreenshotState::Pad; + else + m_screenshot_state = ScreenshotState::None; + } + else + m_screenshot_state = ScreenshotState::None; + + auto texMtl = static_cast(texView->baseTexture); + + int width, height; + texMtl->GetEffectiveSize(width, height, 0); + + uint32 bytesPerRow = GetMtlTextureBytesPerRow(texMtl->format, texMtl->isDepth, width); + uint32 size = GetMtlTextureBytesPerImage(texMtl->format, texMtl->isDepth, height, bytesPerRow); + + // TODO: get a buffer from the memory manager + MTL::Buffer* buffer = m_device->newBuffer(size, MTL::ResourceStorageModeShared); + + auto blitCommandEncoder = GetBlitCommandEncoder(); + blitCommandEncoder->copyFromTexture(texMtl->GetTexture(), 0, 0, MTL::Origin(0, 0, 0), MTL::Size(width, height, 1), buffer, 0, bytesPerRow, 0); + + uint8* bufferPtr = (uint8*)buffer->contents(); + + bool formatValid = true; + std::vector rgb_data; + rgb_data.reserve(3 * width * height); + + auto pixelFormat = texMtl->GetTexture()->pixelFormat(); + // TODO: implement more formats + switch (pixelFormat) + { + case MTL::PixelFormatRGBA8Unorm: + for (auto ptr = bufferPtr; ptr < bufferPtr + size; ptr += 4) + { + rgb_data.emplace_back(*ptr); + rgb_data.emplace_back(*(ptr + 1)); + rgb_data.emplace_back(*(ptr + 2)); + } + break; + case MTL::PixelFormatRGBA8Unorm_sRGB: + for (auto ptr = bufferPtr; ptr < bufferPtr + size; ptr += 4) + { + rgb_data.emplace_back(SRGBComponentToRGB(*ptr)); + rgb_data.emplace_back(SRGBComponentToRGB(*(ptr + 1))); + rgb_data.emplace_back(SRGBComponentToRGB(*(ptr + 2))); + } + break; + default: + cemuLog_log(LogType::Force, "Unsupported screenshot texture pixel format {}", pixelFormat); + formatValid = false; + break; + } + + buffer->release(); + + if (formatValid) + SaveScreenshot(rgb_data, width, height, !padView); +} + +void MetalRenderer::DrawBackbufferQuad(LatteTextureView* texView, RendererOutputShader* shader, bool useLinearTexFilter, + sint32 imageX, sint32 imageY, sint32 imageWidth, sint32 
imageHeight, + bool padView, bool clearBackground) +{ + if (!AcquireDrawable(!padView)) + return; + + MTL::Texture* presentTexture = static_cast(texView)->GetRGBAView(); + + // Create render pass + auto& layer = GetLayer(!padView); + + MTL::RenderPassDescriptor* renderPassDescriptor = MTL::RenderPassDescriptor::alloc()->init(); + auto colorAttachment = renderPassDescriptor->colorAttachments()->object(0); + colorAttachment->setTexture(layer.GetDrawable()->texture()); + colorAttachment->setLoadAction(clearBackground ? MTL::LoadActionClear : MTL::LoadActionLoad); + colorAttachment->setStoreAction(MTL::StoreActionStore); + + auto renderCommandEncoder = GetTemporaryRenderCommandEncoder(renderPassDescriptor); + renderPassDescriptor->release(); + + // Get a render pipeline + + // Find out which shader we are using + uint8 shaderIndex = 255; + if (shader == RendererOutputShader::s_copy_shader) shaderIndex = 0; + else if (shader == RendererOutputShader::s_bicubic_shader) shaderIndex = 1; + else if (shader == RendererOutputShader::s_hermit_shader) shaderIndex = 2; + else if (shader == RendererOutputShader::s_copy_shader_ud) shaderIndex = 3; + else if (shader == RendererOutputShader::s_bicubic_shader_ud) shaderIndex = 4; + else if (shader == RendererOutputShader::s_hermit_shader_ud) shaderIndex = 5; + + uint8 shaderType = shaderIndex % 3; + + // Get the render pipeline state + auto renderPipelineState = m_outputShaderCache->GetPipeline(shader, shaderIndex, m_state.m_usesSRGB); + + // Draw to Metal layer + renderCommandEncoder->setRenderPipelineState(renderPipelineState); + renderCommandEncoder->setFragmentTexture(presentTexture, 0); + renderCommandEncoder->setFragmentSamplerState((useLinearTexFilter ? 
m_linearSampler : m_nearestSampler), 0); + + // Set uniforms + float outputSize[2] = {(float)imageWidth, (float)imageHeight}; + switch (shaderType) + { + case 2: + renderCommandEncoder->setFragmentBytes(outputSize, sizeof(outputSize), 0); + break; + default: + break; + } + + renderCommandEncoder->setViewport(MTL::Viewport{(double)imageX, (double)imageY, (double)imageWidth, (double)imageHeight, 0.0, 1.0}); + renderCommandEncoder->setScissorRect(MTL::ScissorRect{(uint32)imageX, (uint32)imageY, (uint32)imageWidth, (uint32)imageHeight}); + + renderCommandEncoder->drawPrimitives(MTL::PrimitiveTypeTriangle, NS::UInteger(0), NS::UInteger(3)); + + EndEncoding(); +} + +bool MetalRenderer::BeginFrame(bool mainWindow) +{ + return AcquireDrawable(mainWindow); +} + +void MetalRenderer::Flush(bool waitIdle) +{ + if (m_recordedDrawcalls > 0 || waitIdle) + CommitCommandBuffer(); + + if (waitIdle && m_executingCommandBuffers.size() != 0) + m_executingCommandBuffers.back()->waitUntilCompleted(); +} + +void MetalRenderer::NotifyLatteCommandProcessorIdle() +{ + //if (m_commitOnIdle) + // CommitCommandBuffer(); +} + +bool MetalRenderer::ImguiBegin(bool mainWindow) +{ + if (!Renderer::ImguiBegin(mainWindow)) + return false; + + if (!AcquireDrawable(mainWindow)) + return false; + + EnsureImGuiBackend(); + + // Check if the font texture needs to be built + ImGuiIO& io = ImGui::GetIO(); + if (!io.Fonts->IsBuilt()) + ImGui_ImplMetal_CreateFontsTexture(m_device); + + auto& layer = GetLayer(mainWindow); + + // Render pass descriptor + MTL::RenderPassDescriptor* renderPassDescriptor = MTL::RenderPassDescriptor::alloc()->init(); + auto colorAttachment = renderPassDescriptor->colorAttachments()->object(0); + colorAttachment->setTexture(layer.GetDrawable()->texture()); + colorAttachment->setLoadAction(MTL::LoadActionLoad); + colorAttachment->setStoreAction(MTL::StoreActionStore); + + // New frame + ImGui_ImplMetal_NewFrame(renderPassDescriptor); + ImGui_UpdateWindowInformation(mainWindow); + 
ImGui::NewFrame(); + + if (m_encoderType != MetalEncoderType::Render) + GetTemporaryRenderCommandEncoder(renderPassDescriptor); + renderPassDescriptor->release(); + + return true; +} + +void MetalRenderer::ImguiEnd() +{ + EnsureImGuiBackend(); + + if (m_encoderType != MetalEncoderType::Render) + { + cemuLog_logOnce(LogType::Force, "no render command encoder, cannot draw ImGui"); + return; + } + + ImGui::Render(); + ImGui_ImplMetal_RenderDrawData(ImGui::GetDrawData(), GetCurrentCommandBuffer(), (MTL::RenderCommandEncoder*)m_commandEncoder); + //ImGui::EndFrame(); + + EndEncoding(); +} + +ImTextureID MetalRenderer::GenerateTexture(const std::vector& data, const Vector2i& size) +{ + try + { + std::vector tmp(size.x * size.y * 4); + for (size_t i = 0; i < data.size() / 3; ++i) + { + tmp[(i * 4) + 0] = data[(i * 3) + 0]; + tmp[(i * 4) + 1] = data[(i * 3) + 1]; + tmp[(i * 4) + 2] = data[(i * 3) + 2]; + tmp[(i * 4) + 3] = 0xFF; + } + + MTL::TextureDescriptor* desc = MTL::TextureDescriptor::alloc()->init(); + desc->setTextureType(MTL::TextureType2D); + desc->setPixelFormat(MTL::PixelFormatRGBA8Unorm); + desc->setWidth(size.x); + desc->setHeight(size.y); + desc->setStorageMode(m_isAppleGPU ? MTL::StorageModeShared : MTL::StorageModeManaged); + desc->setUsage(MTL::TextureUsageShaderRead); + + MTL::Texture* texture = m_device->newTexture(desc); + desc->release(); + + // TODO: do a GPU copy? 
+ texture->replaceRegion(MTL::Region(0, 0, size.x, size.y), 0, 0, tmp.data(), size.x * 4, 0); + + return (ImTextureID)texture; + } + catch (const std::exception& ex) + { + cemuLog_log(LogType::Force, "can't generate imgui texture: {}", ex.what()); + return nullptr; + } +} + +void MetalRenderer::DeleteTexture(ImTextureID id) +{ + EnsureImGuiBackend(); + + ((MTL::Texture*)id)->release(); +} + +void MetalRenderer::DeleteFontTextures() +{ + EnsureImGuiBackend(); + + ImGui_ImplMetal_DestroyFontsTexture(); +} + +void MetalRenderer::AppendOverlayDebugInfo() +{ + ImGui::Text("--- GPU info ---"); + ImGui::Text("GPU %s", m_device->name()->utf8String()); + ImGui::Text("Is Apple GPU %s", (m_isAppleGPU ? "yes" : "no")); + ImGui::Text("Supports framebuffer fetch %s", (m_supportsFramebufferFetch ? "yes" : "no")); + ImGui::Text("Has unified memory %s", (m_hasUnifiedMemory ? "yes" : "no")); + ImGui::Text("Supports Metal3 %s", (m_supportsMetal3 ? "yes" : "no")); + + ImGui::Text("--- Metal info ---"); + ImGui::Text("Render pipeline states %zu", m_pipelineCache->GetPipelineCacheSize()); + + ImGui::Text("--- Metal info (per frame) ---"); + ImGui::Text("Command buffers %u", m_performanceMonitor.m_commandBuffers); + ImGui::Text("Render passes %u", m_performanceMonitor.m_renderPasses); + ImGui::Text("Clears %u", m_performanceMonitor.m_clears); + ImGui::Text("Manual vertex fetch draws %u (mesh draws: %u)", m_performanceMonitor.m_manualVertexFetchDraws, m_performanceMonitor.m_meshDraws); + ImGui::Text("Triangle fans %u", m_performanceMonitor.m_triangleFans); + + ImGui::Text("--- Cache debug info ---"); + + uint32 bufferCacheHeapSize = 0; + uint32 bufferCacheAllocationSize = 0; + uint32 bufferCacheNumAllocations = 0; + + LatteBufferCache_getStats(bufferCacheHeapSize, bufferCacheAllocationSize, bufferCacheNumAllocations); + + ImGui::Text("Buffer"); + ImGui::SameLine(60.0f); + ImGui::Text("%06uKB / %06uKB Allocs: %u", (uint32)(bufferCacheAllocationSize + 1023) / 1024, 
((uint32)bufferCacheHeapSize + 1023) / 1024, (uint32)bufferCacheNumAllocations); + + uint32 numBuffers; + size_t totalSize, freeSize; + + m_memoryManager->GetStagingAllocator().GetStats(numBuffers, totalSize, freeSize); + ImGui::Text("Staging"); + ImGui::SameLine(60.0f); + ImGui::Text("%06uKB / %06uKB Buffers: %u", ((uint32)(totalSize - freeSize) + 1023) / 1024, ((uint32)totalSize + 1023) / 1024, (uint32)numBuffers); + + m_memoryManager->GetIndexAllocator().GetStats(numBuffers, totalSize, freeSize); + ImGui::Text("Index"); + ImGui::SameLine(60.0f); + ImGui::Text("%06uKB / %06uKB Buffers: %u", ((uint32)(totalSize - freeSize) + 1023) / 1024, ((uint32)totalSize + 1023) / 1024, (uint32)numBuffers); +} + +void MetalRenderer::renderTarget_setViewport(float x, float y, float width, float height, float nearZ, float farZ, bool halfZ) +{ + // halfZ is handled in the shader + + m_state.m_viewport = MTL::Viewport{x, y, width, height, nearZ, farZ}; +} + +void MetalRenderer::renderTarget_setScissor(sint32 scissorX, sint32 scissorY, sint32 scissorWidth, sint32 scissorHeight) +{ + m_state.m_scissor = MTL::ScissorRect{(uint32)scissorX, (uint32)scissorY, (uint32)scissorWidth, (uint32)scissorHeight}; +} + +LatteCachedFBO* MetalRenderer::rendertarget_createCachedFBO(uint64 key) +{ + return new CachedFBOMtl(this, key); +} + +void MetalRenderer::rendertarget_deleteCachedFBO(LatteCachedFBO* cfbo) +{ + if (cfbo == (LatteCachedFBO*)m_state.m_activeFBO.m_fbo) + m_state.m_activeFBO = {nullptr}; +} + +void MetalRenderer::rendertarget_bindFramebufferObject(LatteCachedFBO* cfbo) +{ + m_state.m_activeFBO = {(CachedFBOMtl*)cfbo, MetalAttachmentsInfo((CachedFBOMtl*)cfbo)}; +} + +void* MetalRenderer::texture_acquireTextureUploadBuffer(uint32 size) +{ + return m_memoryManager->AcquireTextureUploadBuffer(size); +} + +void MetalRenderer::texture_releaseTextureUploadBuffer(uint8* mem) +{ + m_memoryManager->ReleaseTextureUploadBuffer(mem); +} + +TextureDecoder* 
MetalRenderer::texture_chooseDecodedFormat(Latte::E_GX2SURFFMT format, bool isDepth, Latte::E_DIM dim, uint32 width, uint32 height) +{ + return GetMtlPixelFormatInfo(format, isDepth).textureDecoder; +} + +void MetalRenderer::texture_clearSlice(LatteTexture* hostTexture, sint32 sliceIndex, sint32 mipIndex) +{ + if (hostTexture->isDepth) + { + texture_clearDepthSlice(hostTexture, sliceIndex, mipIndex, true, hostTexture->hasStencil, 0.0f, 0); + } + else + { + texture_clearColorSlice(hostTexture, sliceIndex, mipIndex, 0.0f, 0.0f, 0.0f, 0.0f); + } +} + +// TODO: do a cpu copy on Apple Silicon? +void MetalRenderer::texture_loadSlice(LatteTexture* hostTexture, sint32 width, sint32 height, sint32 depth, void* pixelData, sint32 sliceIndex, sint32 mipIndex, uint32 compressedImageSize) +{ + auto textureMtl = (LatteTextureMtl*)hostTexture; + + uint32 offsetZ = 0; + if (textureMtl->Is3DTexture()) + { + offsetZ = sliceIndex; + sliceIndex = 0; + } + + size_t bytesPerRow = GetMtlTextureBytesPerRow(textureMtl->format, textureMtl->isDepth, width); + // No need to set bytesPerImage for 3D textures, since we always load just one slice + //size_t bytesPerImage = GetMtlTextureBytesPerImage(textureMtl->GetFormat(), textureMtl->isDepth, height, bytesPerRow); + //if (m_isAppleGPU) + //{ + // textureMtl->GetTexture()->replaceRegion(MTL::Region(0, 0, offsetZ, width, height, 1), mipIndex, sliceIndex, pixelData, bytesPerRow, 0); + //} + //else + //{ + auto blitCommandEncoder = GetBlitCommandEncoder(); + + // Allocate a temporary buffer + auto& bufferAllocator = m_memoryManager->GetStagingAllocator(); + auto allocation = bufferAllocator.AllocateBufferMemory(compressedImageSize, 1); + bufferAllocator.FlushReservation(allocation); + + // Copy the data to the temporary buffer + memcpy(allocation.memPtr, pixelData, compressedImageSize); + //buffer->didModifyRange(NS::Range(allocation.offset, allocation.size)); + + // TODO: specify blit options when copying to a depth stencil texture? 
+ // Copy the data from the temporary buffer to the texture + blitCommandEncoder->copyFromBuffer(allocation.mtlBuffer, allocation.bufferOffset, bytesPerRow, 0, MTL::Size(width, height, 1), textureMtl->GetTexture(), sliceIndex, mipIndex, MTL::Origin(0, 0, offsetZ)); + //} +} + +void MetalRenderer::texture_clearColorSlice(LatteTexture* hostTexture, sint32 sliceIndex, sint32 mipIndex, float r, float g, float b, float a) +{ + if (!FormatIsRenderable(hostTexture->format)) + { + cemuLog_logOnce(LogType::Force, "cannot clear color texture with format {}, because it's not renderable", hostTexture->format); + return; + } + + auto mtlTexture = static_cast(hostTexture)->GetTexture(); + + ClearColorTextureInternal(mtlTexture, sliceIndex, mipIndex, r, g, b, a); +} + +void MetalRenderer::texture_clearDepthSlice(LatteTexture* hostTexture, uint32 sliceIndex, sint32 mipIndex, bool clearDepth, bool clearStencil, float depthValue, uint32 stencilValue) +{ + clearStencil = (clearStencil && GetMtlPixelFormatInfo(hostTexture->format, true).hasStencil); + if (!clearDepth && !clearStencil) + { + cemuLog_logOnce(LogType::Force, "skipping depth/stencil clear"); + return; + } + + auto mtlTexture = static_cast(hostTexture)->GetTexture(); + + MTL::RenderPassDescriptor* renderPassDescriptor = MTL::RenderPassDescriptor::alloc()->init(); + if (clearDepth) + { + auto depthAttachment = renderPassDescriptor->depthAttachment(); + depthAttachment->setTexture(mtlTexture); + depthAttachment->setClearDepth(depthValue); + depthAttachment->setLoadAction(MTL::LoadActionClear); + depthAttachment->setStoreAction(MTL::StoreActionStore); + depthAttachment->setSlice(sliceIndex); + depthAttachment->setLevel(mipIndex); + } + if (clearStencil) + { + auto stencilAttachment = renderPassDescriptor->stencilAttachment(); + stencilAttachment->setTexture(mtlTexture); + stencilAttachment->setClearStencil(stencilValue); + stencilAttachment->setLoadAction(MTL::LoadActionClear); + 
stencilAttachment->setStoreAction(MTL::StoreActionStore); + stencilAttachment->setSlice(sliceIndex); + stencilAttachment->setLevel(mipIndex); + } + + GetTemporaryRenderCommandEncoder(renderPassDescriptor); + renderPassDescriptor->release(); + EndEncoding(); + + // Debug + m_performanceMonitor.m_clears++; +} + +LatteTexture* MetalRenderer::texture_createTextureEx(Latte::E_DIM dim, MPTR physAddress, MPTR physMipAddress, Latte::E_GX2SURFFMT format, uint32 width, uint32 height, uint32 depth, uint32 pitch, uint32 mipLevels, uint32 swizzle, Latte::E_HWTILEMODE tileMode, bool isDepth) +{ + return new LatteTextureMtl(this, dim, physAddress, physMipAddress, format, width, height, depth, pitch, mipLevels, swizzle, tileMode, isDepth); +} + +void MetalRenderer::texture_setLatteTexture(LatteTextureView* textureView, uint32 textureUnit) +{ + m_state.m_textures[textureUnit] = static_cast(textureView); +} + +void MetalRenderer::texture_copyImageSubData(LatteTexture* src, sint32 srcMip, sint32 effectiveSrcX, sint32 effectiveSrcY, sint32 srcSlice, LatteTexture* dst, sint32 dstMip, sint32 effectiveDstX, sint32 effectiveDstY, sint32 dstSlice, sint32 effectiveCopyWidth, sint32 effectiveCopyHeight, sint32 srcDepth_) +{ + // Source size seems to apply to the destination texture as well, therefore we need to adjust it when block size doesn't match + Uvec2 srcBlockTexelSize = GetMtlPixelFormatInfo(src->format, src->isDepth).blockTexelSize; + Uvec2 dstBlockTexelSize = GetMtlPixelFormatInfo(dst->format, dst->isDepth).blockTexelSize; + if (srcBlockTexelSize.x != dstBlockTexelSize.x || srcBlockTexelSize.y != dstBlockTexelSize.y) + { + uint32 multX = (srcBlockTexelSize.x > dstBlockTexelSize.x ? srcBlockTexelSize.x / dstBlockTexelSize.x : dstBlockTexelSize.x / srcBlockTexelSize.x); + effectiveCopyWidth *= multX; + + uint32 multY = (srcBlockTexelSize.y > dstBlockTexelSize.y ? 
srcBlockTexelSize.y / dstBlockTexelSize.y : dstBlockTexelSize.y / srcBlockTexelSize.y); + effectiveCopyHeight *= multY; + } + + auto blitCommandEncoder = GetBlitCommandEncoder(); + + auto mtlSrc = static_cast(src)->GetTexture(); + auto mtlDst = static_cast(dst)->GetTexture(); + + uint32 srcBaseLayer = 0; + uint32 dstBaseLayer = 0; + uint32 srcOffsetZ = 0; + uint32 dstOffsetZ = 0; + uint32 srcLayerCount = 1; + uint32 dstLayerCount = 1; + uint32 srcDepth = 1; + uint32 dstDepth = 1; + + if (src->Is3DTexture()) + { + srcOffsetZ = srcSlice; + srcDepth = srcDepth_; + } + else + { + srcBaseLayer = srcSlice; + srcLayerCount = srcDepth_; + } + + if (dst->Is3DTexture()) + { + dstOffsetZ = dstSlice; + dstDepth = srcDepth_; + } + else + { + dstBaseLayer = dstSlice; + dstLayerCount = srcDepth_; + } + + // If copying whole textures, we can do a more efficient copy + if (effectiveSrcX == 0 && effectiveSrcY == 0 && effectiveDstX == 0 && effectiveDstY == 0 && + srcOffsetZ == 0 && dstOffsetZ == 0 && + effectiveCopyWidth == src->GetMipWidth(srcMip) && effectiveCopyHeight == src->GetMipHeight(srcMip) && srcDepth == src->GetMipDepth(srcMip) && + effectiveCopyWidth == dst->GetMipWidth(dstMip) && effectiveCopyHeight == dst->GetMipHeight(dstMip) && dstDepth == dst->GetMipDepth(dstMip) && + srcLayerCount == dstLayerCount) + { + blitCommandEncoder->copyFromTexture(mtlSrc, srcBaseLayer, srcMip, mtlDst, dstBaseLayer, dstMip, srcLayerCount, 1); + } + else + { + if (srcLayerCount == dstLayerCount) + { + for (uint32 i = 0; i < srcLayerCount; i++) + { + blitCommandEncoder->copyFromTexture(mtlSrc, srcBaseLayer + i, srcMip, MTL::Origin(effectiveSrcX, effectiveSrcY, srcOffsetZ), MTL::Size(effectiveCopyWidth, effectiveCopyHeight, srcDepth), mtlDst, dstBaseLayer + i, dstMip, MTL::Origin(effectiveDstX, effectiveDstY, dstOffsetZ)); + } + } + else + { + for (uint32 i = 0; i < std::max(srcLayerCount, dstLayerCount); i++) + { + if (srcLayerCount == 1) + srcOffsetZ++; + else + srcSlice++; + + if 
(dstLayerCount == 1) + dstOffsetZ++; + else + dstSlice++; + + blitCommandEncoder->copyFromTexture(mtlSrc, srcBaseLayer, srcMip, MTL::Origin(effectiveSrcX, effectiveSrcY, srcOffsetZ), MTL::Size(effectiveCopyWidth, effectiveCopyHeight, 1), mtlDst, dstBaseLayer, dstMip, MTL::Origin(effectiveDstX, effectiveDstY, dstOffsetZ)); + } + } + } +} + +LatteTextureReadbackInfo* MetalRenderer::texture_createReadback(LatteTextureView* textureView) +{ + size_t uploadSize = static_cast(textureView->baseTexture)->GetTexture()->allocatedSize(); + + if ((m_readbackBufferWriteOffset + uploadSize) > TEXTURE_READBACK_SIZE) + { + m_readbackBufferWriteOffset = 0; + } + + auto* result = new LatteTextureReadbackInfoMtl(this, textureView, m_readbackBufferWriteOffset); + m_readbackBufferWriteOffset += uploadSize; + + return result; +} + +void MetalRenderer::surfaceCopy_copySurfaceWithFormatConversion(LatteTexture* sourceTexture, sint32 srcMip, sint32 srcSlice, LatteTexture* destinationTexture, sint32 dstMip, sint32 dstSlice, sint32 width, sint32 height) +{ + // scale copy size to effective size + sint32 effectiveCopyWidth = width; + sint32 effectiveCopyHeight = height; + LatteTexture_scaleToEffectiveSize(sourceTexture, &effectiveCopyWidth, &effectiveCopyHeight, 0); + //sint32 sourceEffectiveWidth, sourceEffectiveHeight; + //sourceTexture->GetEffectiveSize(sourceEffectiveWidth, sourceEffectiveHeight, srcMip); + + texture_copyImageSubData(sourceTexture, srcMip, 0, 0, srcSlice, destinationTexture, dstMip, 0, 0, dstSlice, effectiveCopyWidth, effectiveCopyHeight, 1); +} + +void MetalRenderer::bufferCache_init(const sint32 bufferSize) +{ + m_memoryManager->InitBufferCache(bufferSize); +} + +void MetalRenderer::bufferCache_upload(uint8* buffer, sint32 size, uint32 bufferOffset) +{ + m_memoryManager->UploadToBufferCache(buffer, bufferOffset, size); +} + +void MetalRenderer::bufferCache_copy(uint32 srcOffset, uint32 dstOffset, uint32 size) +{ + m_memoryManager->CopyBufferCache(srcOffset, dstOffset, 
size); +} + +void MetalRenderer::bufferCache_copyStreamoutToMainBuffer(uint32 srcOffset, uint32 dstOffset, uint32 size) +{ + if (m_memoryManager->UseHostMemoryForCache()) + dstOffset -= m_memoryManager->GetImportedMemBaseAddress(); + + CopyBufferToBuffer(GetXfbRingBuffer(), srcOffset, m_memoryManager->GetBufferCache(), dstOffset, size, MTL::RenderStageVertex | MTL::RenderStageMesh, ALL_MTL_RENDER_STAGES); +} + +void MetalRenderer::buffer_bindVertexBuffer(uint32 bufferIndex, uint32 offset, uint32 size) +{ + cemu_assert_debug(!m_memoryManager->UseHostMemoryForCache()); + cemu_assert_debug(bufferIndex < LATTE_MAX_VERTEX_BUFFERS); + + m_state.m_vertexBufferOffsets[bufferIndex] = offset; +} + +void MetalRenderer::buffer_bindUniformBuffer(LatteConst::ShaderType shaderType, uint32 bufferIndex, uint32 offset, uint32 size) +{ + cemu_assert_debug(!m_memoryManager->UseHostMemoryForCache()); + + m_state.m_uniformBufferOffsets[GetMtlGeneralShaderType(shaderType)][bufferIndex] = offset; +} + +RendererShader* MetalRenderer::shader_create(RendererShader::ShaderType type, uint64 baseHash, uint64 auxHash, const std::string& source, bool isGameShader, bool isGfxPackShader) +{ + return new RendererShaderMtl(this, type, baseHash, auxHash, isGameShader, isGfxPackShader, source); +} + +void MetalRenderer::streamout_setupXfbBuffer(uint32 bufferIndex, sint32 ringBufferOffset, uint32 rangeAddr, uint32 rangeSize) +{ + m_state.m_streamoutState.buffers[bufferIndex].enabled = true; + m_state.m_streamoutState.buffers[bufferIndex].ringBufferOffset = ringBufferOffset; +} + +void MetalRenderer::streamout_begin() +{ + // Do nothing +} + +void MetalRenderer::streamout_rendererFinishDrawcall() +{ + // Do nothing +} + +void MetalRenderer::draw_beginSequence() +{ + m_state.m_skipDrawSequence = false; + + bool streamoutEnable = LatteGPUState.contextRegister[mmVGT_STRMOUT_EN] != 0; + + // update shader state + LatteSHRC_UpdateActiveShaders(); + if (LatteGPUState.activeShaderHasError) + { + 
cemuLog_logOnce(LogType::Force, "Skipping drawcalls due to shader error\n"); + m_state.m_skipDrawSequence = true; + cemu_assert_debug(false); + return; + } + + // update render target and texture state + LatteGPUState.requiresTextureBarrier = false; + while (true) + { + LatteGPUState.repeatTextureInitialization = false; + if (!LatteMRT::UpdateCurrentFBO()) + { + cemuLog_logOnce(LogType::Force, "Rendertarget invalid\n"); + m_state.m_skipDrawSequence = true; + return; // no render target + } + + if (!hasValidFramebufferAttached && !streamoutEnable) + { + cemuLog_logOnce(LogType::Force, "Drawcall with no color buffer or depth buffer attached\n"); + m_state.m_skipDrawSequence = true; + return; // no render target + } + LatteTexture_updateTextures(); + if (!LatteGPUState.repeatTextureInitialization) + break; + } + + // apply render target + LatteMRT::ApplyCurrentState(); + + // viewport and scissor box + LatteRenderTarget_updateViewport(); + LatteRenderTarget_updateScissorBox(); + + // check for conditions which would turn the drawcalls into no-ops + bool rasterizerEnable = !LatteGPUState.contextNew.PA_CL_CLIP_CNTL.get_DX_RASTERIZATION_KILL(); + + // GX2SetSpecialState(0, true) enables DX_RASTERIZATION_KILL, but still expects depth writes to happen? 
-> Research which stages are disabled by DX_RASTERIZATION_KILL exactly + // for now we use a workaround: + if (!LatteGPUState.contextNew.PA_CL_VTE_CNTL.get_VPORT_X_OFFSET_ENA()) + rasterizerEnable = true; + + if (!rasterizerEnable && !streamoutEnable) + m_state.m_skipDrawSequence = true; +} + +void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 instanceCount, uint32 count, MPTR indexDataMPTR, Latte::LATTE_VGT_DMA_INDEX_TYPE::E_INDEX_TYPE indexType, bool isFirst) +{ + if (m_state.m_skipDrawSequence) + { + LatteGPUState.drawCallCounter++; + return; + } + + // fast clear color as depth + if (LatteGPUState.contextNew.GetSpecialStateValues()[8] != 0) + { + LatteDraw_handleSpecialState8_clearAsDepth(); + LatteGPUState.drawCallCounter++; + return; + } + else if (LatteGPUState.contextNew.GetSpecialStateValues()[5] != 0) + { + draw_handleSpecialState5(); + LatteGPUState.drawCallCounter++; + return; + } + + auto& encoderState = m_state.m_encoderState; + + // Shaders + LatteDecompilerShader* vertexShader = LatteSHRC_GetActiveVertexShader(); + LatteDecompilerShader* geometryShader = LatteSHRC_GetActiveGeometryShader(); + LatteDecompilerShader* pixelShader = LatteSHRC_GetActivePixelShader(); + const auto fetchShader = LatteSHRC_GetActiveFetchShader(); + + /* + bool neverSkipAccurateBarrier = false; + + // "Accurate barriers" is usually enabled globally but since the CPU cost is substantial we allow users to disable it (debug -> 'Accurate barriers' option) + // We always force accurate barriers for known problematic shaders + if (pixelShader) + { + if (pixelShader->baseHash == 0x6f6f6e7b9aae57af && pixelShader->auxHash == 0x00078787f9249249) // BotW lava + neverSkipAccurateBarrier = true; + if (pixelShader->baseHash == 0x4c0bd596e3aef4a6 && pixelShader->auxHash == 0x003c3c3fc9269249) // BotW foam layer for water on the bottom of waterfalls + neverSkipAccurateBarrier = true; + } + + // Check if we need to end the render pass + if 
(!m_state.m_isFirstDrawInRenderPass && (GetConfig().vk_accurate_barriers || neverSkipAccurateBarrier)) + { + // Fragment shader is most likely to require a render pass flush, so check for it first + bool endRenderPass = CheckIfRenderPassNeedsFlush(pixelShader); + if (!endRenderPass) + endRenderPass = CheckIfRenderPassNeedsFlush(vertexShader); + if (!endRenderPass && geometryShader) + endRenderPass = CheckIfRenderPassNeedsFlush(geometryShader); + + if (endRenderPass) + { + EndEncoding(); + // TODO: only log in debug? + cemuLog_logOnce(LogType::Force, "Ending render pass due to render target self-dependency\n"); + } + } + */ + + // Primitive type + const LattePrimitiveMode primitiveMode = static_cast(LatteGPUState.contextRegister[mmVGT_PRIMITIVE_TYPE]); + auto mtlPrimitiveType = GetMtlPrimitiveType(primitiveMode); + bool isPrimitiveRect = (primitiveMode == Latte::LATTE_VGT_PRIMITIVE_TYPE::E_PRIMITIVE_TYPE::RECTS); + + bool usesGeometryShader = (geometryShader != nullptr || isPrimitiveRect); + bool fetchVertexManually = (usesGeometryShader || fetchShader->mtlFetchVertexManually); + + // Index buffer + Renderer::INDEX_TYPE hostIndexType; + uint32 hostIndexCount; + uint32 indexMin = 0; + uint32 indexMax = 0; + Renderer::IndexAllocation indexAllocation; + LatteIndices_decode(memory_getPointerFromVirtualOffset(indexDataMPTR), indexType, count, primitiveMode, indexMin, indexMax, hostIndexType, hostIndexCount, indexAllocation); + auto indexAllocationMtl = static_cast(indexAllocation.rendererInternal); + + // Buffer cache + if (m_memoryManager->UseHostMemoryForCache()) + { + // direct memory access (Wii U memory space imported as a buffer), update buffer bindings + draw_updateVertexBuffersDirectAccess(); + if (vertexShader) + draw_updateUniformBuffersDirectAccess(vertexShader, mmSQ_VTX_UNIFORM_BLOCK_START); + if (geometryShader) + draw_updateUniformBuffersDirectAccess(geometryShader, mmSQ_GS_UNIFORM_BLOCK_START); + if (pixelShader) + 
draw_updateUniformBuffersDirectAccess(pixelShader, mmSQ_PS_UNIFORM_BLOCK_START); + } + else + { + // synchronize vertex and uniform cache and update buffer bindings + // We need to call this before getting the render command encoder, since it can cause buffer copies + LatteBufferCache_Sync(indexMin + baseVertex, indexMax + baseVertex, baseInstance, instanceCount); + } + + // Render pass + auto renderCommandEncoder = GetRenderCommandEncoder(); + + // Render pipeline state + PipelineObject* pipelineObj = m_pipelineCache->GetRenderPipelineState(fetchShader, vertexShader, geometryShader, pixelShader, m_state.m_lastUsedFBO.m_attachmentsInfo, m_state.m_activeFBO.m_attachmentsInfo, m_state.m_activeFBO.m_fbo->m_size, count, LatteGPUState.contextNew); + if (!pipelineObj->m_pipeline) + return; + + if (pipelineObj->m_pipeline != encoderState.m_renderPipelineState) + { + renderCommandEncoder->setRenderPipelineState(pipelineObj->m_pipeline); + encoderState.m_renderPipelineState = pipelineObj->m_pipeline; + } + + // Depth stencil state + + // Disable depth write when there is no depth attachment + auto& depthControl = LatteGPUState.contextNew.DB_DEPTH_CONTROL; + bool depthWriteEnable = depthControl.get_Z_WRITE_ENABLE(); + if (!m_state.m_activeFBO.m_fbo->depthBuffer.texture) + depthControl.set_Z_WRITE_ENABLE(false); + + MTL::DepthStencilState* depthStencilState = m_depthStencilCache->GetDepthStencilState(LatteGPUState.contextNew); + if (depthStencilState != encoderState.m_depthStencilState) + { + renderCommandEncoder->setDepthStencilState(depthStencilState); + encoderState.m_depthStencilState = depthStencilState; + } + + // Restore the original depth write state + depthControl.set_Z_WRITE_ENABLE(depthWriteEnable); + + // Stencil reference + bool stencilEnable = LatteGPUState.contextNew.DB_DEPTH_CONTROL.get_STENCIL_ENABLE(); + if (stencilEnable) + { + bool backStencilEnable = LatteGPUState.contextNew.DB_DEPTH_CONTROL.get_BACK_STENCIL_ENABLE(); + uint32 stencilRefFront = 
LatteGPUState.contextNew.DB_STENCILREFMASK.get_STENCILREF_F(); + uint32 stencilRefBack; + if (backStencilEnable) + stencilRefBack = LatteGPUState.contextNew.DB_STENCILREFMASK_BF.get_STENCILREF_B(); + else + stencilRefBack = stencilRefFront; + + if (stencilRefFront != encoderState.m_stencilRefFront || stencilRefBack != encoderState.m_stencilRefBack) + { + renderCommandEncoder->setStencilReferenceValues(stencilRefFront, stencilRefBack); + + encoderState.m_stencilRefFront = stencilRefFront; + encoderState.m_stencilRefBack = stencilRefBack; + } + } + + // Blend color + uint32* blendColorConstantU32 = LatteGPUState.contextRegister + Latte::REGADDR::CB_BLEND_RED; + + if (blendColorConstantU32[0] != encoderState.m_blendColor[0] || blendColorConstantU32[1] != encoderState.m_blendColor[1] || blendColorConstantU32[2] != encoderState.m_blendColor[2] || blendColorConstantU32[3] != encoderState.m_blendColor[3]) + { + float* blendColorConstant = (float*)LatteGPUState.contextRegister + Latte::REGADDR::CB_BLEND_RED; + renderCommandEncoder->setBlendColor(blendColorConstant[0], blendColorConstant[1], blendColorConstant[2], blendColorConstant[3]); + + encoderState.m_blendColor[0] = blendColorConstantU32[0]; + encoderState.m_blendColor[1] = blendColorConstantU32[1]; + encoderState.m_blendColor[2] = blendColorConstantU32[2]; + encoderState.m_blendColor[3] = blendColorConstantU32[3]; + } + + // polygon control + const auto& polygonControlReg = LatteGPUState.contextNew.PA_SU_SC_MODE_CNTL; + const auto frontFace = polygonControlReg.get_FRONT_FACE(); + uint32 cullFront = polygonControlReg.get_CULL_FRONT(); + uint32 cullBack = polygonControlReg.get_CULL_BACK(); + uint32 polyOffsetFrontEnable = polygonControlReg.get_OFFSET_FRONT_ENABLED(); + + if (polyOffsetFrontEnable) + { + uint32 frontScaleU32 = LatteGPUState.contextNew.PA_SU_POLY_OFFSET_FRONT_SCALE.getRawValue(); + uint32 frontOffsetU32 = LatteGPUState.contextNew.PA_SU_POLY_OFFSET_FRONT_OFFSET.getRawValue(); + uint32 offsetClampU32 = 
LatteGPUState.contextNew.PA_SU_POLY_OFFSET_CLAMP.getRawValue(); + + if (frontOffsetU32 != encoderState.m_depthBias || frontScaleU32 != encoderState.m_depthSlope || offsetClampU32 != encoderState.m_depthClamp) + { + float frontScale = LatteGPUState.contextNew.PA_SU_POLY_OFFSET_FRONT_SCALE.get_SCALE(); + float frontOffset = LatteGPUState.contextNew.PA_SU_POLY_OFFSET_FRONT_OFFSET.get_OFFSET(); + float offsetClamp = LatteGPUState.contextNew.PA_SU_POLY_OFFSET_CLAMP.get_CLAMP(); + + frontScale /= 16.0f; + + renderCommandEncoder->setDepthBias(frontOffset, frontScale, offsetClamp); + + encoderState.m_depthBias = frontOffsetU32; + encoderState.m_depthSlope = frontScaleU32; + encoderState.m_depthClamp = offsetClampU32; + } + } + else + { + if (0 != encoderState.m_depthBias || 0 != encoderState.m_depthSlope || 0 != encoderState.m_depthClamp) + { + renderCommandEncoder->setDepthBias(0.0f, 0.0f, 0.0f); + + encoderState.m_depthBias = 0; + encoderState.m_depthSlope = 0; + encoderState.m_depthClamp = 0; + } + } + + // Depth clip mode + cemu_assert_debug(LatteGPUState.contextNew.PA_CL_CLIP_CNTL.get_ZCLIP_NEAR_DISABLE() == LatteGPUState.contextNew.PA_CL_CLIP_CNTL.get_ZCLIP_FAR_DISABLE()); // near or far clipping can be disabled individually + bool zClipEnable = LatteGPUState.contextNew.PA_CL_CLIP_CNTL.get_ZCLIP_FAR_DISABLE() == false; + + if (zClipEnable != encoderState.m_depthClipEnable) + { + renderCommandEncoder->setDepthClipMode(zClipEnable ? MTL::DepthClipModeClip : MTL::DepthClipModeClamp); + encoderState.m_depthClipEnable = zClipEnable; + } + + // Visibility result mode + if (m_occlusionQuery.m_active) + { + auto mode = (m_occlusionQuery.m_currentIndex == INVALID_UINT32 ? MTL::VisibilityResultModeDisabled : MTL::VisibilityResultModeCounting); + renderCommandEncoder->setVisibilityResultMode(mode, m_occlusionQuery.m_currentIndex * sizeof(uint64)); + } + + // todo - how does culling behave with rects? 
+ // right now we just assume that their winding is always CW + if (isPrimitiveRect) + { + if (frontFace == Latte::LATTE_PA_SU_SC_MODE_CNTL::E_FRONTFACE::CW) + cullFront = cullBack; + else + cullBack = cullFront; + } + + // Cull mode + + // Cull front and back is handled by disabling rasterization + if (!(cullFront && cullBack)) + { + MTL::CullMode cullMode; + if (cullFront) + cullMode = MTL::CullModeFront; + else if (cullBack) + cullMode = MTL::CullModeBack; + else + cullMode = MTL::CullModeNone; + + if (cullMode != encoderState.m_cullMode) + { + renderCommandEncoder->setCullMode(cullMode); + encoderState.m_cullMode = cullMode; + } + } + + // Front face + MTL::Winding frontFaceWinding; + if (frontFace == Latte::LATTE_PA_SU_SC_MODE_CNTL::E_FRONTFACE::CCW) + frontFaceWinding = MTL::WindingCounterClockwise; + else + frontFaceWinding = MTL::WindingClockwise; + + if (frontFaceWinding != encoderState.m_frontFaceWinding) + { + renderCommandEncoder->setFrontFacingWinding(frontFaceWinding); + encoderState.m_frontFaceWinding = frontFaceWinding; + } + + // Viewport + if (m_state.m_viewport.originX != encoderState.m_viewport.originX || + m_state.m_viewport.originY != encoderState.m_viewport.originY || + m_state.m_viewport.width != encoderState.m_viewport.width || + m_state.m_viewport.height != encoderState.m_viewport.height || + m_state.m_viewport.znear != encoderState.m_viewport.znear || + m_state.m_viewport.zfar != encoderState.m_viewport.zfar) + { + renderCommandEncoder->setViewport(m_state.m_viewport); + + encoderState.m_viewport = m_state.m_viewport; + } + + // Scissor + if (m_state.m_scissor.x != encoderState.m_scissor.x || + m_state.m_scissor.y != encoderState.m_scissor.y || + m_state.m_scissor.width != encoderState.m_scissor.width || + m_state.m_scissor.height != encoderState.m_scissor.height) + { + encoderState.m_scissor = m_state.m_scissor; + + // TODO: clamp scissor to render target dimensions? 
+ //scissor.width = ; + //scissor.height = ; + renderCommandEncoder->setScissorRect(encoderState.m_scissor); + } + + // Resources + + // Vertex buffers + for (uint8 i = 0; i < MAX_MTL_VERTEX_BUFFERS; i++) + { + size_t offset = m_state.m_vertexBufferOffsets[i]; + if (offset != INVALID_OFFSET) + { + // Bind + SetBuffer(renderCommandEncoder, GetMtlShaderType(vertexShader->shaderType, usesGeometryShader), m_memoryManager->GetBufferCache(), offset, GET_MTL_VERTEX_BUFFER_INDEX(i)); + } + } + + // Prepare streamout + m_state.m_streamoutState.verticesPerInstance = count; + LatteStreamout_PrepareDrawcall(count, instanceCount); + + // Uniform buffers, textures and samplers + BindStageResources(renderCommandEncoder, vertexShader, usesGeometryShader); + if (geometryShader) + BindStageResources(renderCommandEncoder, geometryShader, usesGeometryShader); + BindStageResources(renderCommandEncoder, pixelShader, usesGeometryShader); + + // Draw + if (usesGeometryShader) + { + if (hostIndexType != INDEX_TYPE::NONE) + SetBuffer(renderCommandEncoder, METAL_SHADER_TYPE_OBJECT, indexAllocationMtl->mtlBuffer, indexAllocationMtl->bufferOffset, vertexShader->resourceMapping.indexBufferBinding); + + uint8 hostIndexTypeU8 = (uint8)hostIndexType; + renderCommandEncoder->setObjectBytes(&hostIndexTypeU8, sizeof(hostIndexTypeU8), vertexShader->resourceMapping.indexTypeBinding); + encoderState.m_buffers[METAL_SHADER_TYPE_OBJECT][vertexShader->resourceMapping.indexTypeBinding] = {nullptr}; + + uint32 verticesPerPrimitive = 0; + switch (primitiveMode) + { + case LattePrimitiveMode::POINTS: + verticesPerPrimitive = 1; + break; + case LattePrimitiveMode::LINES: + verticesPerPrimitive = 2; + break; + case LattePrimitiveMode::TRIANGLES: + case LattePrimitiveMode::RECTS: + verticesPerPrimitive = 3; + break; + default: + cemuLog_log(LogType::Force, "unimplemented geometry shader primitive mode {}", (uint32)primitiveMode); + break; + } + + renderCommandEncoder->drawMeshThreadgroups(MTL::Size(count * 
instanceCount / verticesPerPrimitive, 1, 1), MTL::Size(verticesPerPrimitive, 1, 1), MTL::Size(1, 1, 1)); + } + else + { + if (hostIndexType != INDEX_TYPE::NONE) + { + auto mtlIndexType = GetMtlIndexType(hostIndexType); + renderCommandEncoder->drawIndexedPrimitives(mtlPrimitiveType, hostIndexCount, mtlIndexType, indexAllocationMtl->mtlBuffer, indexAllocationMtl->bufferOffset, instanceCount, baseVertex, baseInstance); + } + else + { + renderCommandEncoder->drawPrimitives(mtlPrimitiveType, baseVertex, count, instanceCount, baseInstance); + } + } + + m_state.m_isFirstDrawInRenderPass = false; + + // Occlusion queries + if (m_occlusionQuery.m_active) + m_occlusionQuery.m_currentIndex = (m_occlusionQuery.m_currentIndex + 1) % OCCLUSION_QUERY_POOL_SIZE; + + // Streamout + LatteStreamout_FinishDrawcall(m_memoryManager->UseHostMemoryForCache()); + + // Debug + if (fetchVertexManually) + m_performanceMonitor.m_manualVertexFetchDraws++; + if (usesGeometryShader) + m_performanceMonitor.m_meshDraws++; + if (primitiveMode == LattePrimitiveMode::TRIANGLE_FAN) + m_performanceMonitor.m_triangleFans++; + + LatteGPUState.drawCallCounter++; +} + +void MetalRenderer::draw_endSequence() +{ + LatteDecompilerShader* pixelShader = LatteSHRC_GetActivePixelShader(); + // post-drawcall logic + if (pixelShader) + LatteRenderTarget_trackUpdates(); + bool hasReadback = LatteTextureReadback_Update(); + m_recordedDrawcalls++; + // The number of draw calls needs to twice as big, since we are interrupting the render pass + // TODO: ucomment? + if (m_recordedDrawcalls >= m_commitTreshold * 2/* || hasReadback*/) + { + CommitCommandBuffer(); + + // TODO: where should this be called? 
+ LatteTextureReadback_UpdateFinishedTransfers(false); + } +} + +void MetalRenderer::draw_updateVertexBuffersDirectAccess() +{ + LatteFetchShader* parsedFetchShader = LatteSHRC_GetActiveFetchShader(); + if (!parsedFetchShader) + return; + + for (auto& bufferGroup : parsedFetchShader->bufferGroups) + { + uint32 bufferIndex = bufferGroup.attributeBufferIndex; + uint32 bufferBaseRegisterIndex = mmSQ_VTX_ATTRIBUTE_BLOCK_START + bufferIndex * 7; + MPTR bufferAddress = LatteGPUState.contextRegister[bufferBaseRegisterIndex + 0]; + + if (bufferAddress == MPTR_NULL) [[unlikely]] + bufferAddress = m_memoryManager->GetImportedMemBaseAddress(); + + m_state.m_vertexBufferOffsets[bufferIndex] = bufferAddress - m_memoryManager->GetImportedMemBaseAddress(); + } +} + +void MetalRenderer::draw_updateUniformBuffersDirectAccess(LatteDecompilerShader* shader, const uint32 uniformBufferRegOffset) +{ + if (shader->uniformMode == LATTE_DECOMPILER_UNIFORM_MODE_FULL_CBANK) + { + for (const auto& buf : shader->list_quickBufferList) + { + sint32 i = buf.index; + MPTR physicalAddr = LatteGPUState.contextRegister[uniformBufferRegOffset + i * 7 + 0]; + uint32 uniformSize = LatteGPUState.contextRegister[uniformBufferRegOffset + i * 7 + 1] + 1; + + if (physicalAddr == MPTR_NULL) [[unlikely]] + { + cemu_assert_unimplemented(); + continue; + } + uniformSize = std::min(uniformSize, buf.size); + + cemu_assert_debug(physicalAddr < 0x50000000); + + uint32 bufferIndex = i; + cemu_assert_debug(bufferIndex < 16); + + m_state.m_uniformBufferOffsets[GetMtlGeneralShaderType(shader->shaderType)][bufferIndex] = physicalAddr - m_memoryManager->GetImportedMemBaseAddress(); + } + } +} + +void MetalRenderer::draw_handleSpecialState5() +{ + LatteMRT::UpdateCurrentFBO(); + LatteRenderTarget_updateViewport(); + + LatteTextureView* colorBuffer = LatteMRT::GetColorAttachment(0); + LatteTextureView* depthBuffer = LatteMRT::GetDepthAttachment(); + auto colorTextureMtl = static_cast(colorBuffer); + auto depthTextureMtl = 
static_cast(depthBuffer); + + sint32 vpWidth, vpHeight; + LatteMRT::GetVirtualViewportDimensions(vpWidth, vpHeight); + + // Get the pipeline + MTL::PixelFormat colorPixelFormat = colorTextureMtl->GetRGBAView()->pixelFormat(); + auto& pipeline = m_copyDepthToColorPipelines[colorPixelFormat]; + if (!pipeline) + { + m_copyDepthToColorDesc->colorAttachments()->object(0)->setPixelFormat(colorPixelFormat); + + NS::Error* error = nullptr; + pipeline = m_device->newRenderPipelineState(m_copyDepthToColorDesc, &error); + if (error) + { + cemuLog_log(LogType::Force, "failed to create copy depth to color pipeline (error: {})", error->localizedDescription()->utf8String()); + } + } + + // Sadly, we need to end encoding to ensure that the depth data is up-to-date + EndEncoding(); + + // Copy depth to color + auto renderCommandEncoder = GetRenderCommandEncoder(); + + auto& encoderState = m_state.m_encoderState; + + renderCommandEncoder->setRenderPipelineState(pipeline); + // TODO: make a helper function for this + encoderState.m_renderPipelineState = pipeline; + SetTexture(renderCommandEncoder, METAL_SHADER_TYPE_FRAGMENT, depthTextureMtl->GetRGBAView(), GET_HELPER_TEXTURE_BINDING(0)); + // TODO: make a helper function for this + renderCommandEncoder->setFragmentBytes(&vpWidth, sizeof(sint32), GET_HELPER_BUFFER_BINDING(0)); + encoderState.m_buffers[METAL_SHADER_TYPE_FRAGMENT][GET_HELPER_BUFFER_BINDING(0)] = {nullptr}; + + renderCommandEncoder->drawPrimitives(MTL::PrimitiveTypeTriangle, NS::UInteger(0), NS::UInteger(3)); +} + +Renderer::IndexAllocation MetalRenderer::indexData_reserveIndexMemory(uint32 size) +{ + auto allocation = m_memoryManager->GetIndexAllocator().AllocateBufferMemory(size, 128); + + return {allocation->memPtr, allocation}; +} + +void MetalRenderer::indexData_releaseIndexMemory(IndexAllocation& allocation) +{ + m_memoryManager->GetIndexAllocator().FreeReservation(static_cast(allocation.rendererInternal)); +} + +void 
MetalRenderer::indexData_uploadIndexMemory(IndexAllocation& allocation) +{ + m_memoryManager->GetIndexAllocator().FlushReservation(static_cast(allocation.rendererInternal)); +} + +LatteQueryObject* MetalRenderer::occlusionQuery_create() { + return new LatteQueryObjectMtl(this); +} + +void MetalRenderer::occlusionQuery_destroy(LatteQueryObject* queryObj) { + auto queryObjMtl = static_cast(queryObj); + delete queryObjMtl; +} + +void MetalRenderer::occlusionQuery_flush() { + if (m_occlusionQuery.m_lastCommandBuffer) + m_occlusionQuery.m_lastCommandBuffer->waitUntilCompleted(); +} + +void MetalRenderer::occlusionQuery_updateState() { + ProcessFinishedCommandBuffers(); +} + +void MetalRenderer::SetBuffer(MTL::RenderCommandEncoder* renderCommandEncoder, MetalShaderType shaderType, MTL::Buffer* buffer, size_t offset, uint32 index) +{ + auto& boundBuffer = m_state.m_encoderState.m_buffers[shaderType][index]; + if (buffer == boundBuffer.m_buffer && offset == boundBuffer.m_offset) + return; + + if (buffer == boundBuffer.m_buffer) + { + // Update just the offset + boundBuffer.m_offset = offset; + + switch (shaderType) + { + case METAL_SHADER_TYPE_VERTEX: + renderCommandEncoder->setVertexBufferOffset(offset, index); + break; + case METAL_SHADER_TYPE_OBJECT: + renderCommandEncoder->setObjectBufferOffset(offset, index); + break; + case METAL_SHADER_TYPE_MESH: + renderCommandEncoder->setMeshBufferOffset(offset, index); + break; + case METAL_SHADER_TYPE_FRAGMENT: + renderCommandEncoder->setFragmentBufferOffset(offset, index); + break; + } + + return; + } + + boundBuffer = {buffer, offset}; + + switch (shaderType) + { + case METAL_SHADER_TYPE_VERTEX: + renderCommandEncoder->setVertexBuffer(buffer, offset, index); + break; + case METAL_SHADER_TYPE_OBJECT: + renderCommandEncoder->setObjectBuffer(buffer, offset, index); + break; + case METAL_SHADER_TYPE_MESH: + renderCommandEncoder->setMeshBuffer(buffer, offset, index); + break; + case METAL_SHADER_TYPE_FRAGMENT: + 
renderCommandEncoder->setFragmentBuffer(buffer, offset, index); + break; + } +} + +void MetalRenderer::SetTexture(MTL::RenderCommandEncoder* renderCommandEncoder, MetalShaderType shaderType, MTL::Texture* texture, uint32 index) +{ + auto& boundTexture = m_state.m_encoderState.m_textures[shaderType][index]; + if (texture == boundTexture) + return; + + boundTexture = texture; + + switch (shaderType) + { + case METAL_SHADER_TYPE_VERTEX: + renderCommandEncoder->setVertexTexture(texture, index); + break; + case METAL_SHADER_TYPE_OBJECT: + renderCommandEncoder->setObjectTexture(texture, index); + break; + case METAL_SHADER_TYPE_MESH: + renderCommandEncoder->setMeshTexture(texture, index); + break; + case METAL_SHADER_TYPE_FRAGMENT: + renderCommandEncoder->setFragmentTexture(texture, index); + break; + } +} + +void MetalRenderer::SetSamplerState(MTL::RenderCommandEncoder* renderCommandEncoder, MetalShaderType shaderType, MTL::SamplerState* samplerState, uint32 index) +{ + auto& boundSamplerState = m_state.m_encoderState.m_samplers[shaderType][index]; + if (samplerState == boundSamplerState) + return; + + boundSamplerState = samplerState; + + switch (shaderType) + { + case METAL_SHADER_TYPE_VERTEX: + renderCommandEncoder->setVertexSamplerState(samplerState, index); + break; + case METAL_SHADER_TYPE_OBJECT: + renderCommandEncoder->setObjectSamplerState(samplerState, index); + break; + case METAL_SHADER_TYPE_MESH: + renderCommandEncoder->setMeshSamplerState(samplerState, index); + break; + case METAL_SHADER_TYPE_FRAGMENT: + renderCommandEncoder->setFragmentSamplerState(samplerState, index); + break; + } +} + +MTL::CommandBuffer* MetalRenderer::GetCommandBuffer() +{ + bool needsNewCommandBuffer = (!m_currentCommandBuffer.m_commandBuffer || m_currentCommandBuffer.m_commited); + if (needsNewCommandBuffer) + { + // Debug + //m_commandQueue->insertDebugCaptureBoundary(); + + MTL::CommandBuffer* mtlCommandBuffer = m_commandQueue->commandBuffer(); + m_currentCommandBuffer = 
{mtlCommandBuffer}; + + // Wait for the previous command buffer + if (m_eventValue != -1) + mtlCommandBuffer->encodeWait(m_event, m_eventValue); + + m_recordedDrawcalls = 0; + m_commitTreshold = m_defaultCommitTreshlod; + + // Debug + m_performanceMonitor.m_commandBuffers++; + + return mtlCommandBuffer; + } + else + { + return m_currentCommandBuffer.m_commandBuffer; + } +} + +MTL::RenderCommandEncoder* MetalRenderer::GetTemporaryRenderCommandEncoder(MTL::RenderPassDescriptor* renderPassDescriptor) +{ + EndEncoding(); + + auto commandBuffer = GetCommandBuffer(); + + auto renderCommandEncoder = commandBuffer->renderCommandEncoder(renderPassDescriptor); +#ifdef CEMU_DEBUG_ASSERT + renderCommandEncoder->setLabel(GetLabel("Temporary render command encoder", renderCommandEncoder)); +#endif + m_commandEncoder = renderCommandEncoder; + m_encoderType = MetalEncoderType::Render; + + // Debug + m_performanceMonitor.m_renderPasses++; + + return renderCommandEncoder; +} + +// Some render passes clear the attachments, forceRecreate is supposed to be used in those cases +MTL::RenderCommandEncoder* MetalRenderer::GetRenderCommandEncoder(bool forceRecreate) +{ + // Check if we need to begin a new render pass + if (m_commandEncoder) + { + if (!forceRecreate) + { + if (m_encoderType == MetalEncoderType::Render) + { + bool needsNewRenderPass = (m_state.m_lastUsedFBO.m_fbo == nullptr); + if (!needsNewRenderPass) + { + for (uint8 i = 0; i < 8; i++) + { + if (m_state.m_activeFBO.m_fbo->colorBuffer[i].texture && m_state.m_activeFBO.m_fbo->colorBuffer[i].texture != m_state.m_lastUsedFBO.m_fbo->colorBuffer[i].texture) + { + needsNewRenderPass = true; + break; + } + } + } + + if (!needsNewRenderPass) + { + if (m_state.m_activeFBO.m_fbo->depthBuffer.texture && (m_state.m_activeFBO.m_fbo->depthBuffer.texture != m_state.m_lastUsedFBO.m_fbo->depthBuffer.texture || ( m_state.m_activeFBO.m_fbo->depthBuffer.hasStencil && !m_state.m_lastUsedFBO.m_fbo->depthBuffer.hasStencil))) + { + 
needsNewRenderPass = true; + } + } + + if (!needsNewRenderPass) + { + return (MTL::RenderCommandEncoder*)m_commandEncoder; + } + } + } + + EndEncoding(); + } + + auto commandBuffer = GetCommandBuffer(); + + auto renderCommandEncoder = commandBuffer->renderCommandEncoder(m_state.m_activeFBO.m_fbo->GetRenderPassDescriptor()); +#ifdef CEMU_DEBUG_ASSERT + renderCommandEncoder->setLabel(GetLabel("Render command encoder", renderCommandEncoder)); +#endif + m_commandEncoder = renderCommandEncoder; + m_encoderType = MetalEncoderType::Render; + + // Update state + m_state.m_lastUsedFBO = m_state.m_activeFBO; + m_state.m_isFirstDrawInRenderPass = true; + + ResetEncoderState(); + + // Debug + m_performanceMonitor.m_renderPasses++; + + return renderCommandEncoder; +} + +MTL::ComputeCommandEncoder* MetalRenderer::GetComputeCommandEncoder() +{ + if (m_commandEncoder) + { + if (m_encoderType == MetalEncoderType::Compute) + { + return (MTL::ComputeCommandEncoder*)m_commandEncoder; + } + + EndEncoding(); + } + + auto commandBuffer = GetCommandBuffer(); + + auto computeCommandEncoder = commandBuffer->computeCommandEncoder(); + m_commandEncoder = computeCommandEncoder; + m_encoderType = MetalEncoderType::Compute; + + ResetEncoderState(); + + return computeCommandEncoder; +} + +MTL::BlitCommandEncoder* MetalRenderer::GetBlitCommandEncoder() +{ + if (m_commandEncoder) + { + if (m_encoderType == MetalEncoderType::Blit) + { + return (MTL::BlitCommandEncoder*)m_commandEncoder; + } + + EndEncoding(); + } + + auto commandBuffer = GetCommandBuffer(); + + auto blitCommandEncoder = commandBuffer->blitCommandEncoder(); + m_commandEncoder = blitCommandEncoder; + m_encoderType = MetalEncoderType::Blit; + + ResetEncoderState(); + + return blitCommandEncoder; +} + +void MetalRenderer::EndEncoding() +{ + if (m_commandEncoder) + { + m_commandEncoder->endEncoding(); + //m_commandEncoder->release(); + m_commandEncoder = nullptr; + m_encoderType = MetalEncoderType::None; + + // Commit the command buffer 
if enough draw calls have been recorded + if (m_recordedDrawcalls >= m_commitTreshold) + CommitCommandBuffer(); + } +} + +void MetalRenderer::CommitCommandBuffer() +{ + if (!m_currentCommandBuffer.m_commandBuffer) + return; + + EndEncoding(); + + ProcessFinishedCommandBuffers(); + + // Commit the command buffer + if (!m_currentCommandBuffer.m_commited) + { + // Handled differently, since it seems like Metal doesn't always call the completion handler + //commandBuffer.m_commandBuffer->addCompletedHandler(^(MTL::CommandBuffer*) { + // m_memoryManager->GetTemporaryBufferAllocator().CommandBufferFinished(commandBuffer.m_commandBuffer); + //}); + + // Signal event + m_eventValue = (m_eventValue + 1) % EVENT_VALUE_WRAP; + auto mtlCommandBuffer = m_currentCommandBuffer.m_commandBuffer; + mtlCommandBuffer->encodeSignalEvent(m_event, m_eventValue); + + mtlCommandBuffer->commit(); + m_currentCommandBuffer.m_commited = true; + + m_executingCommandBuffers.push_back(mtlCommandBuffer); + + // Debug + //m_commandQueue->insertDebugCaptureBoundary(); + } +} + +void MetalRenderer::ProcessFinishedCommandBuffers() +{ + // Check for finished command buffers + for (auto it = m_executingCommandBuffers.begin(); it != m_executingCommandBuffers.end();) + { + auto commandBuffer = *it; + if (CommandBufferCompleted(commandBuffer)) + { + m_memoryManager->CleanupBuffers(commandBuffer); + commandBuffer->release(); + it = m_executingCommandBuffers.erase(it); + } + else + { + ++it; + } + } +} + +bool MetalRenderer::AcquireDrawable(bool mainWindow) +{ + auto& layer = GetLayer(mainWindow); + if (!layer.GetLayer()) + return false; + + const bool latteBufferUsesSRGB = mainWindow ? LatteGPUState.tvBufferUsesSRGB : LatteGPUState.drcBufferUsesSRGB; + if (latteBufferUsesSRGB != m_state.m_usesSRGB) + { + layer.GetLayer()->setPixelFormat(latteBufferUsesSRGB ? 
MTL::PixelFormatBGRA8Unorm_sRGB : MTL::PixelFormatBGRA8Unorm); + m_state.m_usesSRGB = latteBufferUsesSRGB; + } + + return layer.AcquireDrawable(); +} + +/* +bool MetalRenderer::CheckIfRenderPassNeedsFlush(LatteDecompilerShader* shader) +{ + sint32 textureCount = shader->resourceMapping.getTextureCount(); + for (int i = 0; i < textureCount; ++i) + { + const auto relative_textureUnit = shader->resourceMapping.getTextureUnitFromBindingPoint(i); + auto hostTextureUnit = relative_textureUnit; + auto textureDim = shader->textureUnitDim[relative_textureUnit]; + + // Texture is accessed as a framebuffer fetch, therefore there is no need to flush it + if (shader->textureRenderTargetIndex[relative_textureUnit] != 255) + continue; + + auto texUnitRegIndex = hostTextureUnit * 7; + switch (shader->shaderType) + { + case LatteConst::ShaderType::Vertex: + hostTextureUnit += LATTE_CEMU_VS_TEX_UNIT_BASE; + texUnitRegIndex += Latte::REGADDR::SQ_TEX_RESOURCE_WORD0_N_VS; + break; + case LatteConst::ShaderType::Pixel: + hostTextureUnit += LATTE_CEMU_PS_TEX_UNIT_BASE; + texUnitRegIndex += Latte::REGADDR::SQ_TEX_RESOURCE_WORD0_N_PS; + break; + case LatteConst::ShaderType::Geometry: + hostTextureUnit += LATTE_CEMU_GS_TEX_UNIT_BASE; + texUnitRegIndex += Latte::REGADDR::SQ_TEX_RESOURCE_WORD0_N_GS; + break; + default: + UNREACHABLE; + } + + auto textureView = m_state.m_textures[hostTextureUnit]; + if (!textureView) + continue; + + LatteTexture* baseTexture = textureView->baseTexture; + + // If the texture is also used in the current render pass, we need to end the render pass to "flush" the texture + for (uint8 i = 0; i < LATTE_NUM_COLOR_TARGET; i++) + { + auto colorTarget = m_state.m_activeFBO.m_fbo->colorBuffer[i].texture; + if (colorTarget && colorTarget->baseTexture == baseTexture) + return true; + } + } + + return false; +} +*/ + +void MetalRenderer::BindStageResources(MTL::RenderCommandEncoder* renderCommandEncoder, LatteDecompilerShader* shader, bool usesGeometryShader) +{ + auto 
mtlShaderType = GetMtlShaderType(shader->shaderType, usesGeometryShader); + + sint32 textureCount = shader->resourceMapping.getTextureCount(); + for (int i = 0; i < textureCount; ++i) + { + const auto relative_textureUnit = shader->resourceMapping.getTextureUnitFromBindingPoint(i); + auto hostTextureUnit = relative_textureUnit; + + // Don't bind textures that are accessed with a framebuffer fetch + if (m_supportsFramebufferFetch && shader->textureRenderTargetIndex[relative_textureUnit] != 255) + continue; + + auto textureDim = shader->textureUnitDim[relative_textureUnit]; + auto texUnitRegIndex = hostTextureUnit * 7; + switch (shader->shaderType) + { + case LatteConst::ShaderType::Vertex: + hostTextureUnit += LATTE_CEMU_VS_TEX_UNIT_BASE; + texUnitRegIndex += Latte::REGADDR::SQ_TEX_RESOURCE_WORD0_N_VS; + break; + case LatteConst::ShaderType::Pixel: + hostTextureUnit += LATTE_CEMU_PS_TEX_UNIT_BASE; + texUnitRegIndex += Latte::REGADDR::SQ_TEX_RESOURCE_WORD0_N_PS; + break; + case LatteConst::ShaderType::Geometry: + hostTextureUnit += LATTE_CEMU_GS_TEX_UNIT_BASE; + texUnitRegIndex += Latte::REGADDR::SQ_TEX_RESOURCE_WORD0_N_GS; + break; + default: + UNREACHABLE; + } + + // TODO: correct? 
+ uint32 binding = shader->resourceMapping.getTextureBaseBindingPoint() + i; + if (binding >= MAX_MTL_TEXTURES) + { + cemuLog_logOnce(LogType::Force, "invalid texture binding {}", binding); + continue; + } + + auto textureView = m_state.m_textures[hostTextureUnit]; + if (!textureView) + { + if (textureDim == Latte::E_DIM::DIM_1D) + SetTexture(renderCommandEncoder, mtlShaderType, m_nullTexture1D, binding); + else + SetTexture(renderCommandEncoder, mtlShaderType, m_nullTexture2D, binding); + SetSamplerState(renderCommandEncoder, mtlShaderType, m_nearestSampler, binding); + continue; + } + + if (textureDim == Latte::E_DIM::DIM_1D && (textureView->dim != Latte::E_DIM::DIM_1D)) + { + SetTexture(renderCommandEncoder, mtlShaderType, m_nullTexture1D, binding); + continue; + } + else if (textureDim == Latte::E_DIM::DIM_2D && (textureView->dim != Latte::E_DIM::DIM_2D && textureView->dim != Latte::E_DIM::DIM_2D_MSAA)) + { + SetTexture(renderCommandEncoder, mtlShaderType, m_nullTexture2D, binding); + continue; + } + + LatteTexture* baseTexture = textureView->baseTexture; + + uint32 stageSamplerIndex = shader->textureUnitSamplerAssignment[relative_textureUnit]; + MTL::SamplerState* sampler; + if (stageSamplerIndex != LATTE_DECOMPILER_SAMPLER_NONE) + { + uint32 samplerIndex = stageSamplerIndex + LatteDecompiler_getTextureSamplerBaseIndex(shader->shaderType); + sampler = m_samplerCache->GetSamplerState(LatteGPUState.contextNew, samplerIndex); + } + else + { + sampler = m_nearestSampler; + } + SetSamplerState(renderCommandEncoder, mtlShaderType, sampler, binding); + + // get texture register word 0 + uint32 word4 = LatteGPUState.contextRegister[texUnitRegIndex + 4]; + auto& boundTexture = m_state.m_encoderState.m_textures[mtlShaderType][binding]; + MTL::Texture* mtlTexture = textureView->GetSwizzledView(word4); + SetTexture(renderCommandEncoder, mtlShaderType, mtlTexture, binding); + } + + // Support buffer + auto GET_UNIFORM_DATA_PTR = [&](size_t index) { return supportBufferData 
+ (index / 4); }; + + sint32 shaderAluConst; + sint32 shaderUniformRegisterOffset; + + switch (shader->shaderType) + { + case LatteConst::ShaderType::Vertex: + shaderAluConst = 0x400; + shaderUniformRegisterOffset = mmSQ_VTX_UNIFORM_BLOCK_START; + break; + case LatteConst::ShaderType::Pixel: + shaderAluConst = 0; + shaderUniformRegisterOffset = mmSQ_PS_UNIFORM_BLOCK_START; + break; + case LatteConst::ShaderType::Geometry: + shaderAluConst = 0; // geometry shader has no ALU const + shaderUniformRegisterOffset = mmSQ_GS_UNIFORM_BLOCK_START; + break; + default: + UNREACHABLE; + } + + if (shader->resourceMapping.uniformVarsBufferBindingPoint >= 0) + { + if (shader->uniform.list_ufTexRescale.empty() == false) + { + for (auto& entry : shader->uniform.list_ufTexRescale) + { + float* xyScale = LatteTexture_getEffectiveTextureScale(shader->shaderType, entry.texUnit); + memcpy(entry.currentValue, xyScale, sizeof(float) * 2); + memcpy(GET_UNIFORM_DATA_PTR(entry.uniformLocation), xyScale, sizeof(float) * 2); + } + } + if (shader->uniform.loc_alphaTestRef >= 0) + { + *GET_UNIFORM_DATA_PTR(shader->uniform.loc_alphaTestRef) = LatteGPUState.contextNew.SX_ALPHA_REF.get_ALPHA_TEST_REF(); + } + if (shader->uniform.loc_pointSize >= 0) + { + const auto& pointSizeReg = LatteGPUState.contextNew.PA_SU_POINT_SIZE; + float pointWidth = (float)pointSizeReg.get_WIDTH() / 8.0f; + if (pointWidth == 0.0f) + pointWidth = 1.0f / 8.0f; // minimum size + *GET_UNIFORM_DATA_PTR(shader->uniform.loc_pointSize) = pointWidth; + } + if (shader->uniform.loc_remapped >= 0) + { + LatteBufferCache_LoadRemappedUniforms(shader, GET_UNIFORM_DATA_PTR(shader->uniform.loc_remapped)); + } + if (shader->uniform.loc_uniformRegister >= 0) + { + uint32* uniformRegData = (uint32*)(LatteGPUState.contextRegister + mmSQ_ALU_CONSTANT0_0 + shaderAluConst); + memcpy(GET_UNIFORM_DATA_PTR(shader->uniform.loc_uniformRegister), uniformRegData, shader->uniform.count_uniformRegister * 16); + } + if 
(shader->uniform.loc_windowSpaceToClipSpaceTransform >= 0) + { + sint32 viewportWidth; + sint32 viewportHeight; + LatteRenderTarget_GetCurrentVirtualViewportSize(&viewportWidth, &viewportHeight); // always call after _updateViewport() + float* v = GET_UNIFORM_DATA_PTR(shader->uniform.loc_windowSpaceToClipSpaceTransform); + v[0] = 2.0f / (float)viewportWidth; + v[1] = 2.0f / (float)viewportHeight; + } + if (shader->uniform.loc_fragCoordScale >= 0) + { + LatteMRT::GetCurrentFragCoordScale(GET_UNIFORM_DATA_PTR(shader->uniform.loc_fragCoordScale)); + } + if (shader->uniform.loc_verticesPerInstance >= 0) + { + *(int*)(supportBufferData + ((size_t)shader->uniform.loc_verticesPerInstance / 4)) = m_state.m_streamoutState.verticesPerInstance; + for (sint32 b = 0; b < LATTE_NUM_STREAMOUT_BUFFER; b++) + { + if (shader->uniform.loc_streamoutBufferBase[b] >= 0) + { + *(uint32*)GET_UNIFORM_DATA_PTR(shader->uniform.loc_streamoutBufferBase[b]) = m_state.m_streamoutState.buffers[b].ringBufferOffset; + } + } + } + + size_t size = shader->uniform.uniformRangeSize; + auto& bufferAllocator = m_memoryManager->GetStagingAllocator(); + auto allocation = bufferAllocator.AllocateBufferMemory(size, 1); + memcpy(allocation.memPtr, supportBufferData, size); + bufferAllocator.FlushReservation(allocation); + + SetBuffer(renderCommandEncoder, mtlShaderType, allocation.mtlBuffer, allocation.bufferOffset, shader->resourceMapping.uniformVarsBufferBindingPoint); + } + + // Uniform buffers + for (sint32 i = 0; i < LATTE_NUM_MAX_UNIFORM_BUFFERS; i++) + { + if (shader->resourceMapping.uniformBuffersBindingPoint[i] >= 0) + { + uint32 binding = shader->resourceMapping.uniformBuffersBindingPoint[i]; + if (binding >= MAX_MTL_BUFFERS) + { + cemuLog_logOnce(LogType::Force, "invalid buffer binding {}", binding); + continue; + } + + size_t offset = m_state.m_uniformBufferOffsets[GetMtlGeneralShaderType(shader->shaderType)][i]; + if (offset == INVALID_OFFSET) + continue; + + SetBuffer(renderCommandEncoder, 
mtlShaderType, m_memoryManager->GetBufferCache(), offset, binding); + } + } + + // Storage buffer + if (shader->resourceMapping.tfStorageBindingPoint >= 0) + { + SetBuffer(renderCommandEncoder, mtlShaderType, m_xfbRingBuffer, 0, shader->resourceMapping.tfStorageBindingPoint); + } +} + +void MetalRenderer::ClearColorTextureInternal(MTL::Texture* mtlTexture, sint32 sliceIndex, sint32 mipIndex, float r, float g, float b, float a) +{ + MTL::RenderPassDescriptor* renderPassDescriptor = MTL::RenderPassDescriptor::alloc()->init(); + auto colorAttachment = renderPassDescriptor->colorAttachments()->object(0); + colorAttachment->setTexture(mtlTexture); + colorAttachment->setClearColor(MTL::ClearColor(r, g, b, a)); + colorAttachment->setLoadAction(MTL::LoadActionClear); + colorAttachment->setStoreAction(MTL::StoreActionStore); + colorAttachment->setSlice(sliceIndex); + colorAttachment->setLevel(mipIndex); + + GetTemporaryRenderCommandEncoder(renderPassDescriptor); + renderPassDescriptor->release(); + EndEncoding(); + + // Debug + m_performanceMonitor.m_clears++; +} + +void MetalRenderer::CopyBufferToBuffer(MTL::Buffer* src, uint32 srcOffset, MTL::Buffer* dst, uint32 dstOffset, uint32 size, MTL::RenderStages after, MTL::RenderStages before) +{ + // TODO: uncomment and fix performance issues + // Do the copy in a vertex shader on Apple GPUs + /* + if (m_isAppleGPU && m_encoderType == MetalEncoderType::Render) + { + auto renderCommandEncoder = static_cast(m_commandEncoder); + + MTL::Resource* barrierBuffers[] = {src}; + renderCommandEncoder->memoryBarrier(barrierBuffers, 1, after, after | MTL::RenderStageVertex); + + renderCommandEncoder->setRenderPipelineState(m_copyBufferToBufferPipeline->GetRenderPipelineState()); + m_state.m_encoderState.m_renderPipelineState = m_copyBufferToBufferPipeline->GetRenderPipelineState(); + + SetBuffer(renderCommandEncoder, METAL_SHADER_TYPE_VERTEX, src, srcOffset, GET_HELPER_BUFFER_BINDING(0)); + SetBuffer(renderCommandEncoder, 
METAL_SHADER_TYPE_VERTEX, dst, dstOffset, GET_HELPER_BUFFER_BINDING(1)); + + renderCommandEncoder->drawPrimitives(MTL::PrimitiveTypePoint, NS::UInteger(0), NS::UInteger(size)); + + barrierBuffers[0] = dst; + renderCommandEncoder->memoryBarrier(barrierBuffers, 1, before | MTL::RenderStageVertex, before); + } + else + { + */ + auto blitCommandEncoder = GetBlitCommandEncoder(); + + blitCommandEncoder->copyFromBuffer(src, srcOffset, dst, dstOffset, size); + //} +} + +void MetalRenderer::SwapBuffer(bool mainWindow) +{ + if (!AcquireDrawable(mainWindow)) + return; + + auto commandBuffer = GetCommandBuffer(); + GetLayer(mainWindow).PresentDrawable(commandBuffer); +} + +void MetalRenderer::EnsureImGuiBackend() +{ + if (!ImGui::GetIO().BackendRendererUserData) + { + ImGui_ImplMetal_Init(m_device); + //ImGui_ImplMetal_CreateFontsTexture(m_device); + } +} + +void MetalRenderer::StartCapture() +{ + auto captureManager = MTL::CaptureManager::sharedCaptureManager(); + auto desc = MTL::CaptureDescriptor::alloc()->init(); + desc->setCaptureObject(m_device); + + // Check if a debugger with support for GPU capture is attached + if (captureManager->supportsDestination(MTL::CaptureDestinationDeveloperTools)) + { + desc->setDestination(MTL::CaptureDestinationDeveloperTools); + } + else + { + if (GetConfig().gpu_capture_dir.GetValue().empty()) + { + cemuLog_log(LogType::Force, "No GPU capture directory specified, cannot do a GPU capture"); + return; + } + + // Check if the GPU trace document destination is available + if (!captureManager->supportsDestination(MTL::CaptureDestinationGPUTraceDocument)) + { + cemuLog_log(LogType::Force, "GPU trace document destination is not available, cannot do a GPU capture"); + return; + } + + // Get current date and time as a string + auto now = std::chrono::system_clock::now(); + std::time_t now_time = std::chrono::system_clock::to_time_t(now); + std::ostringstream oss; + oss << std::put_time(std::localtime(&now_time), "%Y-%m-%d_%H-%M-%S"); + 
std::string now_str = oss.str(); + + std::string capturePath = fmt::format("{}/cemu_{}.gputrace", GetConfig().gpu_capture_dir.GetValue(), now_str); + desc->setDestination(MTL::CaptureDestinationGPUTraceDocument); + desc->setOutputURL(ToNSURL(capturePath)); + } + + NS::Error* error = nullptr; + captureManager->startCapture(desc, &error); + if (error) + { + cemuLog_log(LogType::Force, "Failed to start GPU capture: {}", error->localizedDescription()->utf8String()); + } + + m_capturing = true; +} + +void MetalRenderer::EndCapture() +{ + auto captureManager = MTL::CaptureManager::sharedCaptureManager(); + captureManager->stopCapture(); + + m_capturing = false; +} diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h new file mode 100644 index 000000000..04c63be82 --- /dev/null +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h @@ -0,0 +1,552 @@ +#pragma once + +#include "Cafe/HW/Latte/Renderer/Renderer.h" + +#include "Cafe/HW/Latte/Renderer/Metal/MetalLayerHandle.h" +#include "Cafe/HW/Latte/Renderer/Metal/MetalPerformanceMonitor.h" +#include "Cafe/HW/Latte/Renderer/Metal/MetalOutputShaderCache.h" +#include "Cafe/HW/Latte/Renderer/Metal/MetalAttachmentsInfo.h" + +enum MetalGeneralShaderType +{ + METAL_GENERAL_SHADER_TYPE_VERTEX, + METAL_GENERAL_SHADER_TYPE_GEOMETRY, + METAL_GENERAL_SHADER_TYPE_FRAGMENT, + + METAL_GENERAL_SHADER_TYPE_TOTAL +}; + +inline MetalGeneralShaderType GetMtlGeneralShaderType(LatteConst::ShaderType shaderType) +{ + switch (shaderType) + { + case LatteConst::ShaderType::Vertex: + return METAL_GENERAL_SHADER_TYPE_VERTEX; + case LatteConst::ShaderType::Geometry: + return METAL_GENERAL_SHADER_TYPE_GEOMETRY; + case LatteConst::ShaderType::Pixel: + return METAL_GENERAL_SHADER_TYPE_FRAGMENT; + default: + return METAL_GENERAL_SHADER_TYPE_TOTAL; + } +} + +enum MetalShaderType +{ + METAL_SHADER_TYPE_VERTEX, + METAL_SHADER_TYPE_OBJECT, + METAL_SHADER_TYPE_MESH, + METAL_SHADER_TYPE_FRAGMENT, + + 
METAL_SHADER_TYPE_TOTAL +}; + +inline MetalShaderType GetMtlShaderType(LatteConst::ShaderType shaderType, bool usesGeometryShader) +{ + switch (shaderType) + { + case LatteConst::ShaderType::Vertex: + if (usesGeometryShader) + return METAL_SHADER_TYPE_OBJECT; + else + return METAL_SHADER_TYPE_VERTEX; + case LatteConst::ShaderType::Geometry: + return METAL_SHADER_TYPE_MESH; + case LatteConst::ShaderType::Pixel: + return METAL_SHADER_TYPE_FRAGMENT; + default: + return METAL_SHADER_TYPE_TOTAL; + } +} + +struct MetalEncoderState +{ + MTL::RenderPipelineState* m_renderPipelineState = nullptr; + MTL::DepthStencilState* m_depthStencilState = nullptr; + MTL::CullMode m_cullMode = MTL::CullModeNone; + MTL::Winding m_frontFaceWinding = MTL::WindingClockwise; + MTL::Viewport m_viewport; + MTL::ScissorRect m_scissor; + uint32 m_stencilRefFront = 0; + uint32 m_stencilRefBack = 0; + uint32 m_blendColor[4] = {0}; + uint32 m_depthBias = 0; + uint32 m_depthSlope = 0; + uint32 m_depthClamp = 0; + bool m_depthClipEnable = true; + struct { + MTL::Buffer* m_buffer; + size_t m_offset; + } m_buffers[METAL_SHADER_TYPE_TOTAL][MAX_MTL_BUFFERS]; + MTL::Texture* m_textures[METAL_SHADER_TYPE_TOTAL][MAX_MTL_TEXTURES]; + MTL::SamplerState* m_samplers[METAL_SHADER_TYPE_TOTAL][MAX_MTL_SAMPLERS]; +}; + +struct MetalStreamoutState +{ + struct + { + bool enabled; + uint32 ringBufferOffset; + } buffers[LATTE_NUM_STREAMOUT_BUFFER]; + sint32 verticesPerInstance; +}; + +struct MetalActiveFBOState +{ + class CachedFBOMtl* m_fbo = nullptr; + MetalAttachmentsInfo m_attachmentsInfo; +}; + +struct MetalState +{ + MetalEncoderState m_encoderState{}; + + bool m_usesSRGB = false; + + bool m_skipDrawSequence = false; + bool m_isFirstDrawInRenderPass = true; + + MetalActiveFBOState m_activeFBO; + // If the FBO changes, but it's the same FBO as the last one with some omitted attachments, this FBO doesn't change + MetalActiveFBOState m_lastUsedFBO; + + size_t m_vertexBufferOffsets[MAX_MTL_VERTEX_BUFFERS]; + class 
LatteTextureViewMtl* m_textures[LATTE_NUM_MAX_TEX_UNITS * 3] = {nullptr}; + size_t m_uniformBufferOffsets[METAL_GENERAL_SHADER_TYPE_TOTAL][MAX_MTL_BUFFERS]; + + MTL::Viewport m_viewport; + MTL::ScissorRect m_scissor; + + MetalStreamoutState m_streamoutState; +}; + +struct MetalCommandBuffer +{ + MTL::CommandBuffer* m_commandBuffer = nullptr; + bool m_commited = false; +}; + +enum class MetalEncoderType +{ + None, + Render, + Compute, + Blit, +}; + +class MetalRenderer : public Renderer +{ +public: + static constexpr uint32 OCCLUSION_QUERY_POOL_SIZE = 1024; + static constexpr uint32 TEXTURE_READBACK_SIZE = 32 * 1024 * 1024; // 32 MB + + struct DeviceInfo + { + std::string name; + uint64 uuid; + }; + + static std::vector GetDevices(); + + MetalRenderer(); + ~MetalRenderer() override; + + RendererAPI GetType() override + { + return RendererAPI::Metal; + } + + static MetalRenderer* GetInstance() { + return static_cast(g_renderer.get()); + } + + // Helper functions + MTL::Device* GetDevice() const { + return m_device; + } + + void InitializeLayer(const Vector2i& size, bool mainWindow); + void ShutdownLayer(bool mainWindow); + void ResizeLayer(const Vector2i& size, bool mainWindow); + + void Initialize() override; + void Shutdown() override; + bool IsPadWindowActive() override; + + bool GetVRAMInfo(int& usageInMB, int& totalInMB) const override; + + void ClearColorbuffer(bool padView) override; + void DrawEmptyFrame(bool mainWindow) override; + void SwapBuffers(bool swapTV, bool swapDRC) override; + + void HandleScreenshotRequest(LatteTextureView* texView, bool padView) override; + + void DrawBackbufferQuad(LatteTextureView* texView, RendererOutputShader* shader, bool useLinearTexFilter, + sint32 imageX, sint32 imageY, sint32 imageWidth, sint32 imageHeight, + bool padView, bool clearBackground) override; + bool BeginFrame(bool mainWindow) override; + + // flush control + void Flush(bool waitIdle = false) override; // called when explicit flush is required (e.g. 
by imgui) + void NotifyLatteCommandProcessorIdle() override; // called when command processor has no more commands available or when stalled + + // imgui + bool ImguiBegin(bool mainWindow) override; + void ImguiEnd() override; + ImTextureID GenerateTexture(const std::vector& data, const Vector2i& size) override; + void DeleteTexture(ImTextureID id) override; + void DeleteFontTextures() override; + + bool UseTFViaSSBO() const override { return true; } + void AppendOverlayDebugInfo() override; + + // rendertarget + void renderTarget_setViewport(float x, float y, float width, float height, float nearZ, float farZ, bool halfZ = false) override; + void renderTarget_setScissor(sint32 scissorX, sint32 scissorY, sint32 scissorWidth, sint32 scissorHeight) override; + + LatteCachedFBO* rendertarget_createCachedFBO(uint64 key) override; + void rendertarget_deleteCachedFBO(LatteCachedFBO* fbo) override; + void rendertarget_bindFramebufferObject(LatteCachedFBO* cfbo) override; + + // texture functions + void* texture_acquireTextureUploadBuffer(uint32 size) override; + void texture_releaseTextureUploadBuffer(uint8* mem) override; + + TextureDecoder* texture_chooseDecodedFormat(Latte::E_GX2SURFFMT format, bool isDepth, Latte::E_DIM dim, uint32 width, uint32 height) override; + + void texture_clearSlice(LatteTexture* hostTexture, sint32 sliceIndex, sint32 mipIndex) override; + void texture_loadSlice(LatteTexture* hostTexture, sint32 width, sint32 height, sint32 depth, void* pixelData, sint32 sliceIndex, sint32 mipIndex, uint32 compressedImageSize) override; + void texture_clearColorSlice(LatteTexture* hostTexture, sint32 sliceIndex, sint32 mipIndex, float r, float g, float b, float a) override; + void texture_clearDepthSlice(LatteTexture* hostTexture, uint32 sliceIndex, sint32 mipIndex, bool clearDepth, bool clearStencil, float depthValue, uint32 stencilValue) override; + + LatteTexture* texture_createTextureEx(Latte::E_DIM dim, MPTR physAddress, MPTR physMipAddress, 
Latte::E_GX2SURFFMT format, uint32 width, uint32 height, uint32 depth, uint32 pitch, uint32 mipLevels, uint32 swizzle, Latte::E_HWTILEMODE tileMode, bool isDepth) override; + + void texture_setLatteTexture(LatteTextureView* textureView, uint32 textureUnit) override; + void texture_copyImageSubData(LatteTexture* src, sint32 srcMip, sint32 effectiveSrcX, sint32 effectiveSrcY, sint32 srcSlice, LatteTexture* dst, sint32 dstMip, sint32 effectiveDstX, sint32 effectiveDstY, sint32 dstSlice, sint32 effectiveCopyWidth, sint32 effectiveCopyHeight, sint32 srcDepth) override; + + LatteTextureReadbackInfo* texture_createReadback(LatteTextureView* textureView) override; + + // surface copy + void surfaceCopy_copySurfaceWithFormatConversion(LatteTexture* sourceTexture, sint32 srcMip, sint32 srcSlice, LatteTexture* destinationTexture, sint32 dstMip, sint32 dstSlice, sint32 width, sint32 height) override; + + // buffer cache + void bufferCache_init(const sint32 bufferSize) override; + void bufferCache_upload(uint8* buffer, sint32 size, uint32 bufferOffset) override; + void bufferCache_copy(uint32 srcOffset, uint32 dstOffset, uint32 size) override; + void bufferCache_copyStreamoutToMainBuffer(uint32 srcOffset, uint32 dstOffset, uint32 size) override; + + void buffer_bindVertexBuffer(uint32 bufferIndex, uint32 offset, uint32 size) override; + void buffer_bindUniformBuffer(LatteConst::ShaderType shaderType, uint32 bufferIndex, uint32 offset, uint32 size) override; + + // shader + RendererShader* shader_create(RendererShader::ShaderType type, uint64 baseHash, uint64 auxHash, const std::string& source, bool compileAsync, bool isGfxPackSource) override; + + // streamout + void streamout_setupXfbBuffer(uint32 bufferIndex, sint32 ringBufferOffset, uint32 rangeAddr, uint32 rangeSize) override; + void streamout_begin() override; + void streamout_rendererFinishDrawcall() override; + + // core drawing logic + void draw_beginSequence() override; + void draw_execute(uint32 baseVertex, uint32 
baseInstance, uint32 instanceCount, uint32 count, MPTR indexDataMPTR, Latte::LATTE_VGT_DMA_INDEX_TYPE::E_INDEX_TYPE indexType, bool isFirst) override; + void draw_endSequence() override; + + void draw_updateVertexBuffersDirectAccess(); + void draw_updateUniformBuffersDirectAccess(LatteDecompilerShader* shader, const uint32 uniformBufferRegOffset); + + void draw_handleSpecialState5(); + + // index + IndexAllocation indexData_reserveIndexMemory(uint32 size) override; + void indexData_releaseIndexMemory(IndexAllocation& allocation) override; + void indexData_uploadIndexMemory(IndexAllocation& allocation) override; + + // occlusion queries + LatteQueryObject* occlusionQuery_create() override; + void occlusionQuery_destroy(LatteQueryObject* queryObj) override; + void occlusionQuery_flush() override; + void occlusionQuery_updateState() override; + + // Helpers + MetalPerformanceMonitor& GetPerformanceMonitor() { return m_performanceMonitor; } + + void SetShouldMaximizeConcurrentCompilation(bool shouldMaximizeConcurrentCompilation) + { + if (m_supportsMetal3) + m_device->setShouldMaximizeConcurrentCompilation(shouldMaximizeConcurrentCompilation); + } + + bool IsCommandBufferActive() const + { + return (m_currentCommandBuffer.m_commandBuffer && !m_currentCommandBuffer.m_commited); + } + + MTL::CommandBuffer* GetCurrentCommandBuffer() const + { + cemu_assert_debug(m_currentCommandBuffer.m_commandBuffer); + + return m_currentCommandBuffer.m_commandBuffer; + } + + MTL::CommandBuffer* GetAndRetainCurrentCommandBufferIfNotCompleted() const + { + // The command buffer has been commited and has finished execution + if (m_currentCommandBuffer.m_commited && m_executingCommandBuffers.size() == 0) + return nullptr; + + return GetCurrentCommandBuffer()->retain(); + } + + void RequestSoonCommit() + { + m_commitTreshold = m_recordedDrawcalls + 8; + } + + MTL::CommandEncoder* GetCommandEncoder() + { + return m_commandEncoder; + } + + MetalEncoderType GetEncoderType() + { + return 
m_encoderType; + } + + void ResetEncoderState() + { + m_state.m_encoderState = {}; + + // TODO: set viewport and scissor to render target dimensions if render commands + + for (uint32 i = 0; i < METAL_SHADER_TYPE_TOTAL; i++) + { + for (uint32 j = 0; j < MAX_MTL_BUFFERS; j++) + m_state.m_encoderState.m_buffers[i][j] = {nullptr}; + for (uint32 j = 0; j < MAX_MTL_TEXTURES; j++) + m_state.m_encoderState.m_textures[i][j] = nullptr; + for (uint32 j = 0; j < MAX_MTL_SAMPLERS; j++) + m_state.m_encoderState.m_samplers[i][j] = nullptr; + } + } + + MetalEncoderState& GetEncoderState() + { + return m_state.m_encoderState; + } + + void SetBuffer(MTL::RenderCommandEncoder* renderCommandEncoder, MetalShaderType shaderType, MTL::Buffer* buffer, size_t offset, uint32 index); + void SetTexture(MTL::RenderCommandEncoder* renderCommandEncoder, MetalShaderType shaderType, MTL::Texture* texture, uint32 index); + void SetSamplerState(MTL::RenderCommandEncoder* renderCommandEncoder, MetalShaderType shaderType, MTL::SamplerState* samplerState, uint32 index); + + MTL::CommandBuffer* GetCommandBuffer(); + MTL::RenderCommandEncoder* GetTemporaryRenderCommandEncoder(MTL::RenderPassDescriptor* renderPassDescriptor); + MTL::RenderCommandEncoder* GetRenderCommandEncoder(bool forceRecreate = false); + MTL::ComputeCommandEncoder* GetComputeCommandEncoder(); + MTL::BlitCommandEncoder* GetBlitCommandEncoder(); + void EndEncoding(); + void CommitCommandBuffer(); + void ProcessFinishedCommandBuffers(); + + bool AcquireDrawable(bool mainWindow); + + //bool CheckIfRenderPassNeedsFlush(LatteDecompilerShader* shader); + void BindStageResources(MTL::RenderCommandEncoder* renderCommandEncoder, LatteDecompilerShader* shader, bool usesGeometryShader); + + void ClearColorTextureInternal(MTL::Texture* mtlTexture, sint32 sliceIndex, sint32 mipIndex, float r, float g, float b, float a); + + void CopyBufferToBuffer(MTL::Buffer* src, uint32 srcOffset, MTL::Buffer* dst, uint32 dstOffset, uint32 size, 
MTL::RenderStages after, MTL::RenderStages before); + + // Getters + bool IsAppleGPU() const + { + return m_isAppleGPU; + } + + bool SupportsFramebufferFetch() const + { + return m_supportsFramebufferFetch; + } + + bool HasUnifiedMemory() const + { + return m_hasUnifiedMemory; + } + + bool SupportsMetal3() const + { + return m_supportsMetal3; + } + + //MTL::StorageMode GetOptimalTextureStorageMode() const + //{ + // return (m_isAppleGPU ? MTL::StorageModeShared : MTL::StorageModePrivate); + //} + + MTL::ResourceOptions GetOptimalBufferStorageMode() const + { + return (m_hasUnifiedMemory ? MTL::ResourceStorageModeShared : MTL::ResourceStorageModeManaged); + } + + MTL::Texture* GetNullTexture2D() const + { + return m_nullTexture2D; + } + + MTL::Buffer* GetTextureReadbackBuffer() + { + if (!m_readbackBuffer) + { + m_readbackBuffer = m_device->newBuffer(TEXTURE_READBACK_SIZE, MTL::ResourceStorageModeShared); +#ifdef CEMU_DEBUG_ASSERT + m_readbackBuffer->setLabel(GetLabel("Texture readback buffer", m_readbackBuffer)); +#endif + } + + return m_readbackBuffer; + } + + MTL::Buffer* GetXfbRingBuffer() + { + if (!m_xfbRingBuffer) + { + // HACK: using just LatteStreamout_GetRingBufferSize will cause page faults + m_xfbRingBuffer = m_device->newBuffer(LatteStreamout_GetRingBufferSize() * 4, MTL::ResourceStorageModePrivate); +#ifdef CEMU_DEBUG_ASSERT + m_xfbRingBuffer->setLabel(GetLabel("Transform feedback buffer", m_xfbRingBuffer)); +#endif + } + + return m_xfbRingBuffer; + } + + MTL::Buffer* GetOcclusionQueryResultBuffer() const + { + return m_occlusionQuery.m_resultBuffer; + } + + uint64* GetOcclusionQueryResultsPtr() + { + return m_occlusionQuery.m_resultsPtr; + } + + uint32 GetOcclusionQueryIndex() + { + return m_occlusionQuery.m_currentIndex; + } + + void BeginOcclusionQuery() + { + m_occlusionQuery.m_active = true; + } + + void EndOcclusionQuery() + { + m_occlusionQuery.m_active = false; + + // Release the old command buffer + if (m_occlusionQuery.m_lastCommandBuffer) + 
m_occlusionQuery.m_lastCommandBuffer->release(); + + // Get and retain the current command buffer + m_occlusionQuery.m_lastCommandBuffer = GetAndRetainCurrentCommandBufferIfNotCompleted(); + } + + // GPU capture + void CaptureFrame() + { + m_captureFrame = true; + } + +private: + MetalLayerHandle m_mainLayer; + MetalLayerHandle m_padLayer; + + MetalPerformanceMonitor m_performanceMonitor; + + // Metal objects + MTL::Device* m_device = nullptr; + MTL::CommandQueue* m_commandQueue; + + // Feature support + bool m_isAppleGPU; + bool m_supportsFramebufferFetch; + bool m_hasUnifiedMemory; + bool m_supportsMetal3; + uint32 m_recommendedMaxVRAMUsage; + MetalPixelFormatSupport m_pixelFormatSupport; + + // Managers and caches + class MetalMemoryManager* m_memoryManager; + class MetalOutputShaderCache* m_outputShaderCache; + class MetalPipelineCache* m_pipelineCache; + class MetalDepthStencilCache* m_depthStencilCache; + class MetalSamplerCache* m_samplerCache; + + // Pipelines + MTL::RenderPipelineDescriptor* m_copyDepthToColorDesc; + std::map m_copyDepthToColorPipelines; + + // Void vertex pipelines + class MetalVoidVertexPipeline* m_copyBufferToBufferPipeline; + + // Synchronization resources + MTL::Event* m_event; + int32_t m_eventValue = -1; + + // Resources + MTL::SamplerState* m_nearestSampler; + MTL::SamplerState* m_linearSampler; + + // Null resources + MTL::Texture* m_nullTexture1D; + MTL::Texture* m_nullTexture2D; + + // Texture readback + MTL::Buffer* m_readbackBuffer = nullptr; + uint32 m_readbackBufferWriteOffset = 0; + + // Transform feedback + MTL::Buffer* m_xfbRingBuffer = nullptr; + + // Occlusion queries + struct + { + MTL::Buffer* m_resultBuffer; + uint64* m_resultsPtr; + uint32 m_currentIndex = 0; + bool m_active = false; + MTL::CommandBuffer* m_lastCommandBuffer = nullptr; + } m_occlusionQuery; + + // Active objects + MetalCommandBuffer m_currentCommandBuffer{}; + std::vector m_executingCommandBuffers; + MetalEncoderType m_encoderType = 
MetalEncoderType::None; + MTL::CommandEncoder* m_commandEncoder = nullptr; + + uint32 m_recordedDrawcalls; + uint32 m_defaultCommitTreshlod; + uint32 m_commitTreshold; + + // State + MetalState m_state; + + // GPU capture + bool m_captureFrame = false; + bool m_capturing = false; + + // Helpers + MetalLayerHandle& GetLayer(bool mainWindow) + { + return (mainWindow ? m_mainLayer : m_padLayer); + } + + void SwapBuffer(bool mainWindow); + + void EnsureImGuiBackend(); + + // GPU capture + void StartCapture(); + void EndCapture(); +}; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalSamplerCache.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalSamplerCache.cpp new file mode 100644 index 000000000..b7d5a2ecd --- /dev/null +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalSamplerCache.cpp @@ -0,0 +1,129 @@ +#include "Cafe/HW/Latte/Renderer/Metal/MetalSamplerCache.h" +#include "Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h" +#include "HW/Latte/Renderer/Metal/LatteToMtl.h" + +MetalSamplerCache::~MetalSamplerCache() +{ + for (auto& pair : m_samplerCache) + { + pair.second->release(); + } + m_samplerCache.clear(); +} + +MTL::SamplerState* MetalSamplerCache::GetSamplerState(const LatteContextRegister& lcr, uint32 samplerIndex) +{ + uint64 stateHash = CalculateSamplerHash(lcr, samplerIndex); + auto& samplerState = m_samplerCache[stateHash]; + if (samplerState) + return samplerState; + + // Sampler state + const _LatteRegisterSetSampler* samplerWords = lcr.SQ_TEX_SAMPLER + samplerIndex; + + MTL::SamplerDescriptor* samplerDescriptor = MTL::SamplerDescriptor::alloc()->init(); + + // lod + uint32 iMinLOD = samplerWords->WORD1.get_MIN_LOD(); + uint32 iMaxLOD = samplerWords->WORD1.get_MAX_LOD(); + sint32 iLodBias = samplerWords->WORD1.get_LOD_BIAS(); + + // TODO: uncomment + // apply relative lod bias from graphic pack + //if (baseTexture->overwriteInfo.hasRelativeLodBias) + // iLodBias += baseTexture->overwriteInfo.relativeLodBias; + // apply absolute lod bias from graphic pack + //if 
(baseTexture->overwriteInfo.hasLodBias) + // iLodBias = baseTexture->overwriteInfo.lodBias; + + auto filterMip = samplerWords->WORD0.get_MIP_FILTER(); + if (filterMip == Latte::LATTE_SQ_TEX_SAMPLER_WORD0_0::E_Z_FILTER::NONE) + { + samplerDescriptor->setMipFilter(MTL::SamplerMipFilterNearest); + samplerDescriptor->setLodMinClamp(0.0f); + samplerDescriptor->setLodMaxClamp(0.25f); + } + else if (filterMip == Latte::LATTE_SQ_TEX_SAMPLER_WORD0_0::E_Z_FILTER::POINT) + { + samplerDescriptor->setMipFilter(MTL::SamplerMipFilterNearest); + samplerDescriptor->setLodMinClamp((float)iMinLOD / 64.0f); + samplerDescriptor->setLodMaxClamp((float)iMaxLOD / 64.0f); + } + else if (filterMip == Latte::LATTE_SQ_TEX_SAMPLER_WORD0_0::E_Z_FILTER::LINEAR) + { + samplerDescriptor->setMipFilter(MTL::SamplerMipFilterLinear); + samplerDescriptor->setLodMinClamp((float)iMinLOD / 64.0f); + samplerDescriptor->setLodMaxClamp((float)iMaxLOD / 64.0f); + } + else + { + // fallback for invalid constants + samplerDescriptor->setMipFilter(MTL::SamplerMipFilterLinear); + samplerDescriptor->setLodMinClamp((float)iMinLOD / 64.0f); + samplerDescriptor->setLodMaxClamp((float)iMaxLOD / 64.0f); + } + + auto filterMin = samplerWords->WORD0.get_XY_MIN_FILTER(); + cemu_assert_debug(filterMin != Latte::LATTE_SQ_TEX_SAMPLER_WORD0_0::E_XY_FILTER::BICUBIC); // todo + samplerDescriptor->setMinFilter((filterMin == Latte::LATTE_SQ_TEX_SAMPLER_WORD0_0::E_XY_FILTER::POINT || filterMin == Latte::LATTE_SQ_TEX_SAMPLER_WORD0_0::E_XY_FILTER::ANISO_POINT) ? MTL::SamplerMinMagFilterNearest : MTL::SamplerMinMagFilterLinear); + + auto filterMag = samplerWords->WORD0.get_XY_MAG_FILTER(); // the magnification filter is chosen from XY_MAG_FILTER alone; it must not depend on the minification setting + samplerDescriptor->setMagFilter((filterMag == Latte::LATTE_SQ_TEX_SAMPLER_WORD0_0::E_XY_FILTER::POINT || filterMag == Latte::LATTE_SQ_TEX_SAMPLER_WORD0_0::E_XY_FILTER::ANISO_POINT) ? 
MTL::SamplerMinMagFilterNearest : MTL::SamplerMinMagFilterLinear); + + auto filterZ = samplerWords->WORD0.get_Z_FILTER(); + // todo: z-filter for texture array samplers is customizable for GPU7 but OpenGL/Vulkan doesn't expose this functionality? + + auto clampX = samplerWords->WORD0.get_CLAMP_X(); + auto clampY = samplerWords->WORD0.get_CLAMP_Y(); + auto clampZ = samplerWords->WORD0.get_CLAMP_Z(); + + samplerDescriptor->setSAddressMode(GetMtlSamplerAddressMode(clampX)); + samplerDescriptor->setTAddressMode(GetMtlSamplerAddressMode(clampY)); + samplerDescriptor->setRAddressMode(GetMtlSamplerAddressMode(clampZ)); + + auto maxAniso = samplerWords->WORD0.get_MAX_ANISO_RATIO(); + + // TODO: uncomment + //if (baseTexture->overwriteInfo.anisotropicLevel >= 0) + // maxAniso = baseTexture->overwriteInfo.anisotropicLevel; + + if (maxAniso > 0) + samplerDescriptor->setMaxAnisotropy(1 << maxAniso); + + // TODO: set lod bias + //samplerInfo.mipLodBias = (float)iLodBias / 64.0f; + + // depth compare + //uint8 depthCompareMode = shader->textureUsesDepthCompare[relative_textureUnit] ? 1 : 0; + // TODO: is it okay to just cast? 
+ samplerDescriptor->setCompareFunction(GetMtlCompareFunc((Latte::E_COMPAREFUNC)samplerWords->WORD0.get_DEPTH_COMPARE_FUNCTION())); + + // border + auto borderType = samplerWords->WORD0.get_BORDER_COLOR_TYPE(); + + if (borderType == Latte::LATTE_SQ_TEX_SAMPLER_WORD0_0::E_BORDER_COLOR_TYPE::TRANSPARENT_BLACK) + samplerDescriptor->setBorderColor(MTL::SamplerBorderColorTransparentBlack); + else if (borderType == Latte::LATTE_SQ_TEX_SAMPLER_WORD0_0::E_BORDER_COLOR_TYPE::OPAQUE_BLACK) + samplerDescriptor->setBorderColor(MTL::SamplerBorderColorOpaqueBlack); + else if (borderType == Latte::LATTE_SQ_TEX_SAMPLER_WORD0_0::E_BORDER_COLOR_TYPE::OPAQUE_WHITE) + samplerDescriptor->setBorderColor(MTL::SamplerBorderColorOpaqueWhite); + else + { + // Metal doesn't support custom border color + cemuLog_logOnce(LogType::Force, "Custom border color is not supported in Metal, using transparent black instead"); + samplerDescriptor->setBorderColor(MTL::SamplerBorderColorTransparentBlack); + } + + samplerState = m_mtlr->GetDevice()->newSamplerState(samplerDescriptor); + samplerDescriptor->release(); + + return samplerState; +} + +uint64 MetalSamplerCache::CalculateSamplerHash(const LatteContextRegister& lcr, uint32 samplerIndex) +{ + const _LatteRegisterSetSampler* samplerWords = lcr.SQ_TEX_SAMPLER + samplerIndex; + + // TODO: check this + return *((uint64*)samplerWords); +} diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalSamplerCache.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalSamplerCache.h new file mode 100644 index 000000000..891d7e035 --- /dev/null +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalSamplerCache.h @@ -0,0 +1,21 @@ +#pragma once + +#include + +#include "HW/Latte/ISA/LatteReg.h" + +class MetalSamplerCache +{ +public: + MetalSamplerCache(class MetalRenderer* metalRenderer) : m_mtlr{metalRenderer} {} + ~MetalSamplerCache(); + + MTL::SamplerState* GetSamplerState(const LatteContextRegister& lcr, uint32 samplerIndex); + +private: + class MetalRenderer* m_mtlr; + + std::map 
m_samplerCache; + + uint64 CalculateSamplerHash(const LatteContextRegister& lcr, uint32 samplerIndex); +}; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalVoidVertexPipeline.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalVoidVertexPipeline.cpp new file mode 100644 index 000000000..6789505c3 --- /dev/null +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalVoidVertexPipeline.cpp @@ -0,0 +1,25 @@ +#include "Cafe/HW/Latte/Renderer/Metal/MetalVoidVertexPipeline.h" + +MetalVoidVertexPipeline::MetalVoidVertexPipeline(class MetalRenderer* mtlRenderer, MTL::Library* library, const std::string& vertexFunctionName) +{ + // Render pipeline state + MTL::Function* vertexFunction = library->newFunction(ToNSString(vertexFunctionName)); + + MTL::RenderPipelineDescriptor* renderPipelineDescriptor = MTL::RenderPipelineDescriptor::alloc()->init(); + renderPipelineDescriptor->setVertexFunction(vertexFunction); + renderPipelineDescriptor->setRasterizationEnabled(false); + + NS::Error* error = nullptr; + m_renderPipelineState = mtlRenderer->GetDevice()->newRenderPipelineState(renderPipelineDescriptor, &error); + renderPipelineDescriptor->release(); + vertexFunction->release(); + if (error) + { + cemuLog_log(LogType::Force, "error creating hybrid render pipeline state: {}", error->localizedDescription()->utf8String()); + } +} + +MetalVoidVertexPipeline::~MetalVoidVertexPipeline() +{ + m_renderPipelineState->release(); +} diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalVoidVertexPipeline.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalVoidVertexPipeline.h new file mode 100644 index 000000000..57666a57a --- /dev/null +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalVoidVertexPipeline.h @@ -0,0 +1,16 @@ +#include "Cafe/HW/Latte/Renderer/Metal/MetalCommon.h" +#include "HW/Latte/Renderer/Metal/MetalRenderer.h" +#include "Metal/MTLLibrary.hpp" +#include "Metal/MTLRenderPipeline.hpp" + +class MetalVoidVertexPipeline +{ +public: + MetalVoidVertexPipeline(class MetalRenderer* mtlRenderer, MTL::Library* library, 
const std::string& vertexFunctionName); + ~MetalVoidVertexPipeline(); + + MTL::RenderPipelineState* GetRenderPipelineState() const { return m_renderPipelineState; } + +private: + MTL::RenderPipelineState* m_renderPipelineState; +}; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp new file mode 100644 index 000000000..07073e08c --- /dev/null +++ b/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.cpp @@ -0,0 +1,403 @@ +#include "Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.h" +#include "Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h" +#include "Cafe/HW/Latte/Renderer/Metal/MetalCommon.h" + +#include "Cemu/FileCache/FileCache.h" +#include "config/ActiveSettings.h" +#include "Cemu/Logging/CemuLogging.h" +#include "Common/precompiled.h" +#include "GameProfile/GameProfile.h" +#include "util/helpers/helpers.h" + +#define METAL_AIR_CACHE_NAME "Cemu_AIR_cache" +#define METAL_AIR_CACHE_PATH "/Volumes/" METAL_AIR_CACHE_NAME +#define METAL_AIR_CACHE_SIZE (16 * 1024 * 1024) +#define METAL_AIR_CACHE_BLOCK_COUNT (METAL_AIR_CACHE_SIZE / 512) + +static bool s_isLoadingShadersMtl{false}; +//static bool s_hasRAMFilesystem{false}; +//class FileCache* s_airCache{nullptr}; + +extern std::atomic_int g_compiled_shaders_total; +extern std::atomic_int g_compiled_shaders_async; + +class ShaderMtlThreadPool +{ +public: + void StartThreads() + { + if (m_threadsActive.exchange(true)) + return; + + // Create thread pool + const uint32 threadCount = 2; + for (uint32 i = 0; i < threadCount; ++i) + s_threads.emplace_back(&ShaderMtlThreadPool::CompilerThreadFunc, this); + + // Create AIR cache thread + /* + s_airCacheThread = new std::thread(&ShaderMtlThreadPool::AIRCacheThreadFunc, this); + + // Set priority + sched_param schedParam; + schedParam.sched_priority = 20; + if (pthread_setschedparam(s_airCacheThread->native_handle(), SCHED_FIFO, &schedParam) != 0) { + cemuLog_log(LogType::Force, "failed to set FIFO thread 
priority"); + } + + if (pthread_setschedparam(s_airCacheThread->native_handle(), SCHED_RR, &schedParam) != 0) { + cemuLog_log(LogType::Force, "failed to set RR thread priority"); + } + */ + } + + void StopThreads() + { + if (!m_threadsActive.exchange(false)) + return; + for (uint32 i = 0; i < s_threads.size(); ++i) + s_compilationQueueCount.increment(); + for (auto& it : s_threads) + it.join(); + s_threads.clear(); + + /* + if (s_airCacheThread) + { + s_airCacheQueueCount.increment(); + s_airCacheThread->join(); + delete s_airCacheThread; + } + */ + } + + ~ShaderMtlThreadPool() + { + StopThreads(); + } + + void CompilerThreadFunc() + { + SetThreadName("mtlShaderComp"); + while (m_threadsActive.load(std::memory_order::relaxed)) + { + s_compilationQueueCount.decrementWithWait(); + s_compilationQueueMutex.lock(); + if (s_compilationQueue.empty()) + { + // queue empty again, shaders compiled synchronously via PreponeCompilation() + s_compilationQueueMutex.unlock(); + continue; + } + RendererShaderMtl* job = s_compilationQueue.front(); + s_compilationQueue.pop_front(); + // set compilation state + cemu_assert_debug(job->m_compilationState.getValue() == RendererShaderMtl::COMPILATION_STATE::QUEUED); + job->m_compilationState.setValue(RendererShaderMtl::COMPILATION_STATE::COMPILING); + s_compilationQueueMutex.unlock(); + // compile + job->CompileInternal(); + if (job->ShouldCountCompilation()) + ++g_compiled_shaders_async; + // mark as compiled + cemu_assert_debug(job->m_compilationState.getValue() == RendererShaderMtl::COMPILATION_STATE::COMPILING); + job->m_compilationState.setValue(RendererShaderMtl::COMPILATION_STATE::DONE); + } + } + + /* + void AIRCacheThreadFunc() + { + SetThreadName("mtlAIRCache"); + while (m_threadsActive.load(std::memory_order::relaxed)) + { + s_airCacheQueueCount.decrementWithWait(); + s_airCacheQueueMutex.lock(); + if (s_airCacheQueue.empty()) + { + s_airCacheQueueMutex.unlock(); + continue; + } + + // Create RAM filesystem + if 
(!s_hasRAMFilesystem) + { + executeCommand("diskutil erasevolume HFS+ {} $(hdiutil attach -nomount ram://{})", METAL_AIR_CACHE_NAME, METAL_AIR_CACHE_BLOCK_COUNT); + s_hasRAMFilesystem = true; + } + + RendererShaderMtl* job = s_airCacheQueue.front(); + s_airCacheQueue.pop_front(); + s_airCacheQueueMutex.unlock(); + // compile + job->CompileToAIR(); + } + } + */ + + bool HasThreadsRunning() const { return m_threadsActive; } + +public: + std::vector s_threads; + //std::thread* s_airCacheThread{nullptr}; + + std::deque s_compilationQueue; + CounterSemaphore s_compilationQueueCount; + std::mutex s_compilationQueueMutex; + + /* + std::deque s_airCacheQueue; + CounterSemaphore s_airCacheQueueCount; + std::mutex s_airCacheQueueMutex; + */ + +private: + std::atomic m_threadsActive; +} shaderMtlThreadPool; + +// TODO: find out if it would be possible to cache compiled Metal shaders +void RendererShaderMtl::ShaderCacheLoading_begin(uint64 cacheTitleId) +{ + s_isLoadingShadersMtl = true; + + // Open AIR cache + /* + if (s_airCache) + { + delete s_airCache; + s_airCache = nullptr; + } + uint32 airCacheMagic = GeneratePrecompiledCacheId(); + const std::string cacheFilename = fmt::format("{:016x}_air.bin", cacheTitleId); + const fs::path cachePath = ActiveSettings::GetCachePath("shaderCache/precompiled/{}", cacheFilename); + s_airCache = FileCache::Open(cachePath, true, airCacheMagic); + if (!s_airCache) + cemuLog_log(LogType::Force, "Unable to open AIR cache {}", cacheFilename); + */ + + // Maximize shader compilation speed + static_cast(g_renderer.get())->SetShouldMaximizeConcurrentCompilation(true); +} + +void RendererShaderMtl::ShaderCacheLoading_end() +{ + s_isLoadingShadersMtl = false; + + // Reset shader compilation speed + static_cast(g_renderer.get())->SetShouldMaximizeConcurrentCompilation(false); +} + +void RendererShaderMtl::ShaderCacheLoading_Close() +{ + // Close the AIR cache + /* + if (s_airCache) + { + delete s_airCache; + s_airCache = nullptr; + } + + // Close 
RAM filesystem + if (s_hasRAMFilesystem) + executeCommand("diskutil eject {}", METAL_AIR_CACHE_PATH); + */ +} + +void RendererShaderMtl::Initialize() +{ + shaderMtlThreadPool.StartThreads(); +} + +void RendererShaderMtl::Shutdown() +{ + shaderMtlThreadPool.StopThreads(); +} + +RendererShaderMtl::RendererShaderMtl(MetalRenderer* mtlRenderer, ShaderType type, uint64 baseHash, uint64 auxHash, bool isGameShader, bool isGfxPackShader, const std::string& mslCode) + : RendererShader(type, baseHash, auxHash, isGameShader, isGfxPackShader), m_mtlr{mtlRenderer}, m_mslCode{mslCode} +{ + // start async compilation + shaderMtlThreadPool.s_compilationQueueMutex.lock(); + m_compilationState.setValue(COMPILATION_STATE::QUEUED); + shaderMtlThreadPool.s_compilationQueue.push_back(this); + shaderMtlThreadPool.s_compilationQueueCount.increment(); + shaderMtlThreadPool.s_compilationQueueMutex.unlock(); + cemu_assert_debug(shaderMtlThreadPool.HasThreadsRunning()); // make sure .StartThreads() was called +} + +RendererShaderMtl::~RendererShaderMtl() +{ + if (m_function) + m_function->release(); +} + +void RendererShaderMtl::PreponeCompilation(bool isRenderThread) +{ + shaderMtlThreadPool.s_compilationQueueMutex.lock(); + bool isStillQueued = m_compilationState.hasState(COMPILATION_STATE::QUEUED); + if (isStillQueued) + { + // remove from queue + shaderMtlThreadPool.s_compilationQueue.erase(std::remove(shaderMtlThreadPool.s_compilationQueue.begin(), shaderMtlThreadPool.s_compilationQueue.end(), this), shaderMtlThreadPool.s_compilationQueue.end()); + m_compilationState.setValue(COMPILATION_STATE::COMPILING); + } + shaderMtlThreadPool.s_compilationQueueMutex.unlock(); + if (!isStillQueued) + { + m_compilationState.waitUntilValue(COMPILATION_STATE::DONE); + if (ShouldCountCompilation()) + --g_compiled_shaders_async; // compilation caused a stall so we don't consider this one async + return; + } + else + { + // compile synchronously + CompileInternal(); + 
m_compilationState.setValue(COMPILATION_STATE::DONE); + } +} + +bool RendererShaderMtl::IsCompiled() +{ + return m_compilationState.hasState(COMPILATION_STATE::DONE); +}; + +bool RendererShaderMtl::WaitForCompiled() +{ + m_compilationState.waitUntilValue(COMPILATION_STATE::DONE); + return true; +} + +bool RendererShaderMtl::ShouldCountCompilation() const +{ + return !s_isLoadingShadersMtl && m_isGameShader; +} + +MTL::Library* RendererShaderMtl::LibraryFromSource() +{ + // Compile from source + MTL::CompileOptions* options = MTL::CompileOptions::alloc()->init(); + if (g_current_game_profile->GetFastMath()) + options->setFastMathEnabled(true); + if (g_current_game_profile->GetPositionInvariance()) + options->setPreserveInvariance(true); + + NS::Error* error = nullptr; + MTL::Library* library = m_mtlr->GetDevice()->newLibrary(ToNSString(m_mslCode), options, &error); + options->release(); + if (!library) // error may also be set for mere warnings; only a null library means the compile failed + { + cemuLog_log(LogType::Force, "failed to create library from source: {} -> {}", error ? error->localizedDescription()->utf8String() : "unknown error", m_mslCode.c_str()); + return nullptr; + } + + return library; +} + +/* +MTL::Library* RendererShaderMtl::LibraryFromAIR(std::span data) +{ + dispatch_data_t dispatchData = dispatch_data_create(data.data(), data.size(), nullptr, DISPATCH_DATA_DESTRUCTOR_DEFAULT); + + NS::Error* error = nullptr; + MTL::Library* library = m_mtlr->GetDevice()->newLibrary(dispatchData, &error); + if (error) + { + cemuLog_log(LogType::Force, "failed to create library from AIR: {}", error->localizedDescription()->utf8String()); + return nullptr; + } + + return library; +} +*/ + +void RendererShaderMtl::CompileInternal() +{ + MTL::Library* library = nullptr; + + // First, try to retrieve the compiled shader from the AIR cache + /* + if (s_isLoadingShadersMtl && (m_isGameShader && !m_isGfxPackShader) && s_airCache) + { + cemu_assert_debug(m_baseHash != 0); + uint64 h1, h2; + GenerateShaderPrecompiledCacheFilename(m_type, m_baseHash, m_auxHash, h1, h2); + std::vector 
cacheFileData; + if (s_airCache->GetFile({ h1, h2 }, cacheFileData)) + { + library = LibraryFromAIR(std::span(cacheFileData.data(), cacheFileData.size())); + FinishCompilation(); + } + } + */ + + // Not in the cache, compile from source + if (!library) + { + // Compile from source + library = LibraryFromSource(); + if (!library) + return; + + // Store in the AIR cache + /* + shaderMtlThreadPool.s_airCacheQueueMutex.lock(); + shaderMtlThreadPool.s_airCacheQueue.push_back(this); + shaderMtlThreadPool.s_airCacheQueueCount.increment(); + shaderMtlThreadPool.s_airCacheQueueMutex.unlock(); + */ + } + + m_function = library->newFunction(ToNSString("main0")); + library->release(); + + // Count shader compilation + if (ShouldCountCompilation()) + g_compiled_shaders_total++; +} + +/* +void RendererShaderMtl::CompileToAIR() +{ + uint64 h1, h2; + GenerateShaderPrecompiledCacheFilename(m_type, m_baseHash, m_auxHash, h1, h2); + + // The shader is not in the cache, compile it + std::string baseFilename = fmt::format("{}/{}_{}", METAL_AIR_CACHE_PATH, h1, h2); + + // Source + std::ofstream mslFile; + mslFile.open(fmt::format("{}.metal", baseFilename)); + mslFile << m_mslCode; + mslFile.close(); + + // Compile + if (!executeCommand("xcrun -sdk macosx metal -o {}.ir -c {}.metal -w", baseFilename, baseFilename)) + return; + if (!executeCommand("xcrun -sdk macosx metallib -o {}.metallib {}.ir", baseFilename, baseFilename)) + return; + + // Clean up + executeCommand("rm {}.metal", baseFilename); + executeCommand("rm {}.ir", baseFilename); + + // Load from the newly generated AIR + MemoryMappedFile airFile(fmt::format("{}.metallib", baseFilename)); + std::span airData = std::span(airFile.data(), airFile.size()); + //library = LibraryFromAIR(std::span(airData.data(), airData.size())); + + // Store in the cache + s_airCache->AddFile({ h1, h2 }, airData.data(), airData.size()); + + // Clean up + executeCommand("rm {}.metallib", baseFilename); + + FinishCompilation(); +} +*/ + +void 
RendererShaderMtl::FinishCompilation() +{ + m_mslCode.clear(); + m_mslCode.shrink_to_fit(); +} diff --git a/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.h b/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.h new file mode 100644 index 000000000..9953ba746 --- /dev/null +++ b/src/Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.h @@ -0,0 +1,79 @@ +#pragma once + +#include "Cafe/HW/Latte/Renderer/RendererShader.h" +#include "HW/Latte/Renderer/Metal/CachedFBOMtl.h" +#include "HW/Latte/Renderer/Metal/MetalRenderer.h" +#include "util/helpers/ConcurrentQueue.h" +#include "util/helpers/Semaphore.h" + +#include + +class RendererShaderMtl : public RendererShader +{ + friend class ShaderMtlThreadPool; + + enum class COMPILATION_STATE : uint32 + { + NONE, + QUEUED, + COMPILING, + DONE + }; + +public: + static void ShaderCacheLoading_begin(uint64 cacheTitleId); + static void ShaderCacheLoading_end(); + static void ShaderCacheLoading_Close(); + + static void Initialize(); + static void Shutdown(); + + RendererShaderMtl(class MetalRenderer* mtlRenderer, ShaderType type, uint64 baseHash, uint64 auxHash, bool isGameShader, bool isGfxPackShader, const std::string& mslCode); + virtual ~RendererShaderMtl(); + + MTL::Function* GetFunction() const + { + return m_function; + } + + sint32 GetUniformLocation(const char* name) override + { + cemu_assert_suspicious(); + return 0; + } + + void SetUniform2fv(sint32 location, void* data, sint32 count) override + { + cemu_assert_suspicious(); + } + + void SetUniform4iv(sint32 location, void* data, sint32 count) override + { + cemu_assert_suspicious(); + } + + void PreponeCompilation(bool isRenderThread) override; + bool IsCompiled() override; + bool WaitForCompiled() override; + +private: + class MetalRenderer* m_mtlr; + + MTL::Function* m_function = nullptr; + + StateSemaphore m_compilationState{ COMPILATION_STATE::NONE }; + + std::string m_mslCode; + + bool ShouldCountCompilation() const; + + MTL::Library* LibraryFromSource(); + + 
//MTL::Library* LibraryFromAIR(std::span data); + + void CompileInternal(); + + //void CompileToAIR(); + + void FinishCompilation(); +}; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/UtilityShaderSource.h b/src/Cafe/HW/Latte/Renderer/Metal/UtilityShaderSource.h new file mode 100644 index 000000000..2041f4f88 --- /dev/null +++ b/src/Cafe/HW/Latte/Renderer/Metal/UtilityShaderSource.h @@ -0,0 +1,51 @@ +#pragma once + +#define __STRINGIFY(x) #x +#define _STRINGIFY(x) __STRINGIFY(x) + +constexpr const char* utilityShaderSource = R"(#include +using namespace metal; + +#define GET_BUFFER_BINDING(index) (28 + index) +#define GET_TEXTURE_BINDING(index) (29 + index) +#define GET_SAMPLER_BINDING(index) (14 + index) + +constant float2 positions[] = {float2(-1.0, -3.0), float2(-1.0, 1.0), float2(3.0, 1.0)}; + +struct VertexOut { + float4 position [[position]]; + float2 texCoord; +}; + +vertex VertexOut vertexFullscreen(ushort vid [[vertex_id]]) { + VertexOut out; + out.position = float4(positions[vid], 0.0, 1.0); + out.texCoord = positions[vid] * 0.5 + 0.5; + out.texCoord.y = 1.0 - out.texCoord.y; + + return out; +} + +//fragment float4 fragmentPresent(VertexOut in [[stage_in]], texture2d tex [[texture(0)]], //sampler samplr [[sampler(0)]]) { +// return tex.sample(samplr, in.texCoord); +//} + +vertex void vertexCopyBufferToBuffer(uint vid [[vertex_id]], device uint8_t* src [[buffer(GET_BUFFER_BINDING(0))]], device uint8_t* dst [[buffer(GET_BUFFER_BINDING(1))]]) { + dst[vid] = src[vid]; +} + +fragment float4 fragmentCopyDepthToColor(VertexOut in [[stage_in]], texture2d src [[texture(GET_TEXTURE_BINDING(0))]]) { + return float4(src.read(uint2(in.position.xy)).r, 0.0, 0.0, 0.0); +} + +//struct RestrideParams { +// uint oldStride; +// uint newStride; +//}; + +//vertex void vertexRestrideBuffer(uint vid [[vertex_id]], device uint8_t* src [[buffer//(GET_BUFFER_BINDING(0))]], device uint8_t* dst [[buffer(GET_BUFFER_BINDING(1))]], constant //RestrideParams& params 
[[buffer(GET_BUFFER_BINDING(2))]]) { +// for (uint32_t i = 0; i < params.oldStride; i++) { +// dst[vid * params.newStride + i] = src[vid * params.oldStride + i]; +// } +//} +)"; diff --git a/src/Cafe/HW/Latte/Renderer/MetalView.h b/src/Cafe/HW/Latte/Renderer/MetalView.h new file mode 100644 index 000000000..43e5c7b3f --- /dev/null +++ b/src/Cafe/HW/Latte/Renderer/MetalView.h @@ -0,0 +1,7 @@ +#pragma once + +#import +#import + +@interface MetalView : NSView +@end diff --git a/src/Cafe/HW/Latte/Renderer/MetalView.mm b/src/Cafe/HW/Latte/Renderer/MetalView.mm new file mode 100644 index 000000000..5ca17b5ef --- /dev/null +++ b/src/Cafe/HW/Latte/Renderer/MetalView.mm @@ -0,0 +1,26 @@ +#include "Cafe/HW/Latte/Renderer/MetalView.h" + +@implementation MetalView + +-(BOOL) wantsUpdateLayer { return YES; } + ++(Class) layerClass { return [CAMetalLayer class]; } + +// copied from https://github.com/KhronosGroup/MoltenVK/blob/master/Demos/Cube/macOS/DemoViewController.m + +-(CALayer*) makeBackingLayer +{ + CALayer* layer = [self.class.layerClass layer]; + CGSize viewScale = [self convertSizeToBacking: CGSizeMake(1.0, 1.0)]; + layer.contentsScale = MIN(viewScale.width, viewScale.height); + return layer; +} + +-(BOOL) layer: (CALayer *)layer shouldInheritContentsScale: (CGFloat)newScale fromWindow: (NSWindow *)window +{ + if (newScale == layer.contentsScale) { return NO; } + + layer.contentsScale = newScale; + return YES; +} +@end diff --git a/src/Cafe/HW/Latte/Renderer/OpenGL/OpenGLRenderer.h b/src/Cafe/HW/Latte/Renderer/OpenGL/OpenGLRenderer.h index e29e9d4c4..ddc7ee22b 100644 --- a/src/Cafe/HW/Latte/Renderer/OpenGL/OpenGLRenderer.h +++ b/src/Cafe/HW/Latte/Renderer/OpenGL/OpenGLRenderer.h @@ -248,11 +248,11 @@ class OpenGLRenderer : public Renderer // occlusion queries std::vector list_queryCacheOcclusion; // cache for unused queries - // resource garbage collection + // resource garbage collection struct BufferCacheReleaseQueueEntry { 
BufferCacheReleaseQueueEntry(VirtualBufferHeap_t* heap, VirtualBufferHeapEntry_t* entry) : m_heap(heap), m_entry(entry) {}; - + void free() { virtualBufferHeap_free(m_heap, m_entry); diff --git a/src/Cafe/HW/Latte/Renderer/Renderer.h b/src/Cafe/HW/Latte/Renderer/Renderer.h index 77d588b96..a02eeb108 100644 --- a/src/Cafe/HW/Latte/Renderer/Renderer.h +++ b/src/Cafe/HW/Latte/Renderer/Renderer.h @@ -33,6 +33,7 @@ enum class RendererAPI { OpenGL, Vulkan, + Metal, MAX }; @@ -66,9 +67,9 @@ class Renderer virtual void SwapBuffers(bool swapTV, bool swapDRC) = 0; virtual void HandleScreenshotRequest(LatteTextureView* texView, bool padView){} - - virtual void DrawBackbufferQuad(LatteTextureView* texView, RendererOutputShader* shader, bool useLinearTexFilter, - sint32 imageX, sint32 imageY, sint32 imageWidth, sint32 imageHeight, + + virtual void DrawBackbufferQuad(LatteTextureView* texView, RendererOutputShader* shader, bool useLinearTexFilter, + sint32 imageX, sint32 imageY, sint32 imageWidth, sint32 imageHeight, bool padView, bool clearBackground) = 0; virtual bool BeginFrame(bool mainWindow) = 0; @@ -84,6 +85,7 @@ class Renderer virtual void DeleteFontTextures() = 0; GfxVendor GetVendor() const { return m_vendor; } + virtual bool UseTFViaSSBO() const { return false; } virtual void AppendOverlayDebugInfo() = 0; // rendertarget diff --git a/src/Cafe/HW/Latte/Renderer/RendererOuputShader.cpp b/src/Cafe/HW/Latte/Renderer/RendererOuputShader.cpp index 409dc24fa..55c97a3a1 100644 --- a/src/Cafe/HW/Latte/Renderer/RendererOuputShader.cpp +++ b/src/Cafe/HW/Latte/Renderer/RendererOuputShader.cpp @@ -1,5 +1,6 @@ #include "Cafe/HW/Latte/Renderer/RendererOuputShader.h" #include "Cafe/HW/Latte/Renderer/OpenGL/OpenGLRenderer.h" +#include "HW/Latte/Renderer/Renderer.h" const std::string RendererOutputShader::s_copy_shader_source = R"( @@ -9,6 +10,19 @@ void main() } )"; +const std::string RendererOutputShader::s_copy_shader_source_mtl = +R"(#include +using namespace metal; + +struct 
VertexOut { + float2 uv; +}; + +fragment float4 main0(VertexOut in [[stage_in]], texture2d textureSrc [[texture(0)]], sampler samplr [[sampler(0)]]) { + return float4(textureSrc.sample(samplr, in.uv).rgb, 1.0); +} +)"; + const std::string RendererOutputShader::s_bicubic_shader_source = R"( vec4 cubic(float x) @@ -55,6 +69,57 @@ void main(){ } )"; +const std::string RendererOutputShader::s_bicubic_shader_source_mtl = +R"(#include +using namespace metal; + +float4 cubic(float x) { + float x2 = x * x; + float x3 = x2 * x; + float4 w; + w.x = -x3 + 3 * x2 - 3 * x + 1; + w.y = 3 * x3 - 6 * x2 + 4; + w.z = -3 * x3 + 3 * x2 + 3 * x + 1; + w.w = x3; + return w / 6.0; +} + +float4 bcFilter(texture2d textureSrc, sampler samplr, float2 texcoord, float2 texscale) { + float fx = fract(texcoord.x); + float fy = fract(texcoord.y); + texcoord.x -= fx; + texcoord.y -= fy; + + float4 xcubic = cubic(fx); + float4 ycubic = cubic(fy); + + float4 c = float4(texcoord.x - 0.5, texcoord.x + 1.5, texcoord.y - 0.5, texcoord.y + 1.5); + float4 s = float4(xcubic.x + xcubic.y, xcubic.z + xcubic.w, ycubic.x + ycubic.y, ycubic.z + ycubic.w); + float4 offset = c + float4(xcubic.y, xcubic.w, ycubic.y, ycubic.w) / s; + + float4 sample0 = textureSrc.sample(samplr, float2(offset.x, offset.z) * texscale); + float4 sample1 = textureSrc.sample(samplr, float2(offset.y, offset.z) * texscale); + float4 sample2 = textureSrc.sample(samplr, float2(offset.x, offset.w) * texscale); + float4 sample3 = textureSrc.sample(samplr, float2(offset.y, offset.w) * texscale); + + float sx = s.x / (s.x + s.y); + float sy = s.z / (s.z + s.w); + + return mix( + mix(sample3, sample2, sx), + mix(sample1, sample0, sx), sy); +} + +struct VertexOut { + float2 uv; +}; + +fragment float4 main0(VertexOut in [[stage_in]], texture2d textureSrc [[texture(0)]], sampler samplr [[sampler(0)]]) { + float2 textureSrcResolution = float2(textureSrc.get_width(), textureSrc.get_height()); + return float4(bcFilter(textureSrc, samplr, in.uv * 
textureSrcResolution, float2(1.0, 1.0) / textureSrcResolution).rgb, 1.0); +} +)"; + const std::string RendererOutputShader::s_hermite_shader_source = R"( // https://www.shadertoy.com/view/MllSzX @@ -67,7 +132,7 @@ vec3 CubicHermite (vec3 A, vec3 B, vec3 C, vec3 D, float t) vec3 b = A - (5.0*B)/2.0 + 2.0*C - D / 2.0; vec3 c = -A/2.0 + C/2.0; vec3 d = B; - + return a*t3 + b*t2 + c*t + d; } @@ -75,36 +140,36 @@ vec3 CubicHermite (vec3 A, vec3 B, vec3 C, vec3 D, float t) vec3 BicubicHermiteTexture(vec2 uv, vec4 texelSize) { vec2 pixel = uv*texelSize.zw + 0.5; - vec2 frac = fract(pixel); + vec2 frac = fract(pixel); pixel = floor(pixel) / texelSize.zw - vec2(texelSize.xy/2.0); - + vec4 doubleSize = texelSize*2.0; vec3 C00 = texture(textureSrc, pixel + vec2(-texelSize.x ,-texelSize.y)).rgb; vec3 C10 = texture(textureSrc, pixel + vec2( 0.0 ,-texelSize.y)).rgb; vec3 C20 = texture(textureSrc, pixel + vec2( texelSize.x ,-texelSize.y)).rgb; vec3 C30 = texture(textureSrc, pixel + vec2( doubleSize.x,-texelSize.y)).rgb; - + vec3 C01 = texture(textureSrc, pixel + vec2(-texelSize.x , 0.0)).rgb; vec3 C11 = texture(textureSrc, pixel + vec2( 0.0 , 0.0)).rgb; vec3 C21 = texture(textureSrc, pixel + vec2( texelSize.x , 0.0)).rgb; - vec3 C31 = texture(textureSrc, pixel + vec2( doubleSize.x, 0.0)).rgb; - + vec3 C31 = texture(textureSrc, pixel + vec2( doubleSize.x, 0.0)).rgb; + vec3 C02 = texture(textureSrc, pixel + vec2(-texelSize.x , texelSize.y)).rgb; vec3 C12 = texture(textureSrc, pixel + vec2( 0.0 , texelSize.y)).rgb; vec3 C22 = texture(textureSrc, pixel + vec2( texelSize.x , texelSize.y)).rgb; - vec3 C32 = texture(textureSrc, pixel + vec2( doubleSize.x, texelSize.y)).rgb; - + vec3 C32 = texture(textureSrc, pixel + vec2( doubleSize.x, texelSize.y)).rgb; + vec3 C03 = texture(textureSrc, pixel + vec2(-texelSize.x , doubleSize.y)).rgb; vec3 C13 = texture(textureSrc, pixel + vec2( 0.0 , doubleSize.y)).rgb; vec3 C23 = texture(textureSrc, pixel + vec2( texelSize.x , doubleSize.y)).rgb; - 
vec3 C33 = texture(textureSrc, pixel + vec2( doubleSize.x, doubleSize.y)).rgb; - + vec3 C33 = texture(textureSrc, pixel + vec2( doubleSize.x, doubleSize.y)).rgb; + vec3 CP0X = CubicHermite(C00, C10, C20, C30, frac.x); vec3 CP1X = CubicHermite(C01, C11, C21, C31, frac.x); vec3 CP2X = CubicHermite(C02, C12, C22, C32, frac.x); vec3 CP3X = CubicHermite(C03, C13, C23, C33, frac.x); - + return CubicHermite(CP0X, CP1X, CP2X, CP3X, frac.y); } @@ -114,9 +179,77 @@ void main(){ } )"; +const std::string RendererOutputShader::s_hermite_shader_source_mtl = +R"(#include +using namespace metal; + +// https://www.shadertoy.com/view/MllSzX + +float3 CubicHermite(float3 A, float3 B, float3 C, float3 D, float t) { + float t2 = t*t; + float t3 = t*t*t; + float3 a = -A/2.0 + (3.0*B)/2.0 - (3.0*C)/2.0 + D/2.0; + float3 b = A - (5.0*B)/2.0 + 2.0*C - D / 2.0; + float3 c = -A/2.0 + C/2.0; + float3 d = B; + + return a*t3 + b*t2 + c*t + d; +} + + +float3 BicubicHermiteTexture(texture2d textureSrc, sampler samplr, float2 uv, float4 texelSize) { + float2 pixel = uv*texelSize.zw + 0.5; + float2 frac = fract(pixel); + pixel = floor(pixel) / texelSize.zw - float2(texelSize.xy/2.0); + + float4 doubleSize = texelSize*2.0; // two-texel offset, same as the GLSL hermite shader (texelSize*2.0) + + float3 C00 = textureSrc.sample(samplr, pixel + float2(-texelSize.x ,-texelSize.y)).rgb; + float3 C10 = textureSrc.sample(samplr, pixel + float2( 0.0 ,-texelSize.y)).rgb; + float3 C20 = textureSrc.sample(samplr, pixel + float2( texelSize.x ,-texelSize.y)).rgb; + float3 C30 = textureSrc.sample(samplr, pixel + float2( doubleSize.x,-texelSize.y)).rgb; + + float3 C01 = textureSrc.sample(samplr, pixel + float2(-texelSize.x , 0.0)).rgb; + float3 C11 = textureSrc.sample(samplr, pixel + float2( 0.0 , 0.0)).rgb; + float3 C21 = textureSrc.sample(samplr, pixel + float2( texelSize.x , 0.0)).rgb; + float3 C31 = textureSrc.sample(samplr, pixel + float2( doubleSize.x, 0.0)).rgb; + + float3 C02 = textureSrc.sample(samplr, pixel + float2(-texelSize.x , texelSize.y)).rgb; + float3 C12 = 
textureSrc.sample(samplr, pixel + float2( 0.0 , texelSize.y)).rgb; + float3 C22 = textureSrc.sample(samplr, pixel + float2( texelSize.x , texelSize.y)).rgb; + float3 C32 = textureSrc.sample(samplr, pixel + float2( doubleSize.x, texelSize.y)).rgb; + + float3 C03 = textureSrc.sample(samplr, pixel + float2(-texelSize.x , doubleSize.y)).rgb; + float3 C13 = textureSrc.sample(samplr, pixel + float2( 0.0 , doubleSize.y)).rgb; + float3 C23 = textureSrc.sample(samplr, pixel + float2( texelSize.x , doubleSize.y)).rgb; + float3 C33 = textureSrc.sample(samplr, pixel + float2( doubleSize.x, doubleSize.y)).rgb; + + float3 CP0X = CubicHermite(C00, C10, C20, C30, frac.x); + float3 CP1X = CubicHermite(C01, C11, C21, C31, frac.x); + float3 CP2X = CubicHermite(C02, C12, C22, C32, frac.x); + float3 CP3X = CubicHermite(C03, C13, C23, C33, frac.x); + + return CubicHermite(CP0X, CP1X, CP2X, CP3X, frac.y); +} + +struct VertexOut { + float4 position [[position]]; + float2 uv; +}; + +fragment float4 main0(VertexOut in [[stage_in]], texture2d textureSrc [[texture(0)]], sampler samplr [[sampler(0)]], constant float2& outputResolution [[buffer(0)]]) { + float4 texelSize = float4(1.0 / outputResolution.xy, outputResolution.xy); + return float4(BicubicHermiteTexture(textureSrc, samplr, in.uv, texelSize), 1.0); +} +)"; + RendererOutputShader::RendererOutputShader(const std::string& vertex_source, const std::string& fragment_source) { - auto finalFragmentSrc = PrependFragmentPreamble(fragment_source); + std::string finalFragmentSrc; + if (g_renderer->GetType() == RendererAPI::Metal) + finalFragmentSrc = fragment_source; + else + finalFragmentSrc = PrependFragmentPreamble(fragment_source); m_vertex_shader = g_renderer->shader_create(RendererShader::ShaderType::kVertex, 0, 0, vertex_source, false, false); m_fragment_shader = g_renderer->shader_create(RendererShader::ShaderType::kFragment, 0, 0, finalFragmentSrc, false, false); @@ -190,9 +323,9 @@ std::string 
RendererOutputShader::GetOpenGlVertexSource(bool render_upside_down) R"(#version 400 out vec2 passUV; -out gl_PerVertex -{ - vec4 gl_Position; +out gl_PerVertex +{ + vec4 gl_Position; }; void main(){ @@ -226,7 +359,7 @@ void main(){ vertex_source << R"( passUV = vUV; - gl_Position = vec4(vPos, 0.0, 1.0); + gl_Position = vec4(vPos, 0.0, 1.0); } )"; return vertex_source.str(); @@ -240,9 +373,9 @@ std::string RendererOutputShader::GetVulkanVertexSource(bool render_upside_down) R"(#version 450 layout(location = 0) out vec2 passUV; -out gl_PerVertex -{ - vec4 gl_Position; +out gl_PerVertex +{ + vec4 gl_Position; }; void main(){ @@ -276,7 +409,45 @@ void main(){ vertex_source << R"( passUV = vUV; - gl_Position = vec4(vPos, 0.0, 1.0); + gl_Position = vec4(vPos, 0.0, 1.0); +} +)"; + return vertex_source.str(); +} + +std::string RendererOutputShader::GetMetalVertexSource(bool render_upside_down) +{ + // vertex shader + std::ostringstream vertex_source; + vertex_source << + R"(#include +using namespace metal; + +struct VertexOut { + float4 position [[position]]; + float2 uv; +}; + +vertex VertexOut main0(ushort vid [[vertex_id]]) { + VertexOut out; + float2 pos; + if (vid == 0) pos = float2(-1.0, -3.0); + else if (vid == 1) pos = float2(-1.0, 1.0); + else if (vid == 2) pos = float2(3.0, 1.0); + out.uv = pos * 0.5 + 0.5; + out.uv.y = 1.0 - out.uv.y; +)"; + + if (render_upside_down) + { + vertex_source << + R"( pos.y = -pos.y; + )"; + } + + vertex_source << + R"( out.position = float4(pos, 0.0, 1.0); + return out; } )"; return vertex_source.str(); @@ -304,24 +475,41 @@ layout(location = 0) out vec4 colorOut0; } void RendererOutputShader::InitializeStatic() { - std::string vertex_source, vertex_source_ud; - // vertex shader - if (g_renderer->GetType() == RendererAPI::OpenGL) - { - vertex_source = GetOpenGlVertexSource(false); - vertex_source_ud = GetOpenGlVertexSource(true); - } - else - { - vertex_source = GetVulkanVertexSource(false); - vertex_source_ud = 
GetVulkanVertexSource(true); - } - s_copy_shader = new RendererOutputShader(vertex_source, s_copy_shader_source); - s_copy_shader_ud = new RendererOutputShader(vertex_source_ud, s_copy_shader_source); - - s_bicubic_shader = new RendererOutputShader(vertex_source, s_bicubic_shader_source); - s_bicubic_shader_ud = new RendererOutputShader(vertex_source_ud, s_bicubic_shader_source); - - s_hermit_shader = new RendererOutputShader(vertex_source, s_hermite_shader_source); - s_hermit_shader_ud = new RendererOutputShader(vertex_source_ud, s_hermite_shader_source); + if (g_renderer->GetType() == RendererAPI::Metal) + { + std::string vertex_source = GetMetalVertexSource(false); + std::string vertex_source_ud = GetMetalVertexSource(true); + + s_copy_shader = new RendererOutputShader(vertex_source, s_copy_shader_source_mtl); + s_copy_shader_ud = new RendererOutputShader(vertex_source_ud, s_copy_shader_source_mtl); + + s_bicubic_shader = new RendererOutputShader(vertex_source, s_bicubic_shader_source_mtl); + s_bicubic_shader_ud = new RendererOutputShader(vertex_source_ud, s_bicubic_shader_source_mtl); + + s_hermit_shader = new RendererOutputShader(vertex_source, s_hermite_shader_source_mtl); + s_hermit_shader_ud = new RendererOutputShader(vertex_source_ud, s_hermite_shader_source_mtl); + } + else + { + std::string vertex_source, vertex_source_ud; + // vertex shader + if (g_renderer->GetType() == RendererAPI::OpenGL) + { + vertex_source = GetOpenGlVertexSource(false); + vertex_source_ud = GetOpenGlVertexSource(true); + } + else if (g_renderer->GetType() == RendererAPI::Vulkan) + { + vertex_source = GetVulkanVertexSource(false); + vertex_source_ud = GetVulkanVertexSource(true); + } + s_copy_shader = new RendererOutputShader(vertex_source, s_copy_shader_source); + s_copy_shader_ud = new RendererOutputShader(vertex_source_ud, s_copy_shader_source); + + s_bicubic_shader = new RendererOutputShader(vertex_source, s_bicubic_shader_source); + s_bicubic_shader_ud = new 
RendererOutputShader(vertex_source_ud, s_bicubic_shader_source); + + s_hermit_shader = new RendererOutputShader(vertex_source, s_hermite_shader_source); + s_hermit_shader_ud = new RendererOutputShader(vertex_source_ud, s_hermite_shader_source); + } } diff --git a/src/Cafe/HW/Latte/Renderer/RendererOuputShader.h b/src/Cafe/HW/Latte/Renderer/RendererOuputShader.h index 61b24c20d..fec8dcde8 100644 --- a/src/Cafe/HW/Latte/Renderer/RendererOuputShader.h +++ b/src/Cafe/HW/Latte/Renderer/RendererOuputShader.h @@ -40,8 +40,9 @@ class RendererOutputShader static RendererOutputShader* s_hermit_shader; static RendererOutputShader* s_hermit_shader_ud; - static std::string GetVulkanVertexSource(bool render_upside_down); static std::string GetOpenGlVertexSource(bool render_upside_down); + static std::string GetVulkanVertexSource(bool render_upside_down); + static std::string GetMetalVertexSource(bool render_upside_down); static std::string PrependFragmentPreamble(const std::string& shaderSrc); @@ -63,4 +64,8 @@ class RendererOutputShader static const std::string s_bicubic_shader_source_vk; static const std::string s_hermite_shader_source_vk; + + static const std::string s_copy_shader_source_mtl; + static const std::string s_bicubic_shader_source_mtl; + static const std::string s_hermite_shader_source_mtl; }; diff --git a/src/Cafe/HW/Latte/Renderer/Vulkan/CocoaSurface.mm b/src/Cafe/HW/Latte/Renderer/Vulkan/CocoaSurface.mm index 731a6a267..a68174c93 100644 --- a/src/Cafe/HW/Latte/Renderer/Vulkan/CocoaSurface.mm +++ b/src/Cafe/HW/Latte/Renderer/Vulkan/CocoaSurface.mm @@ -1,36 +1,7 @@ #include "Cafe/HW/Latte/Renderer/Vulkan/CocoaSurface.h" #include "Cafe/HW/Latte/Renderer/Vulkan/VulkanAPI.h" -#import -#import - -@interface MetalView : NSView -@end - -@implementation MetalView - --(BOOL) wantsUpdateLayer { return YES; } - -+(Class) layerClass { return [CAMetalLayer class]; } - -// copied from https://github.com/KhronosGroup/MoltenVK/blob/master/Demos/Cube/macOS/DemoViewController.m - 
--(CALayer*) makeBackingLayer -{ - CALayer* layer = [self.class.layerClass layer]; - CGSize viewScale = [self convertSizeToBacking: CGSizeMake(1.0, 1.0)]; - layer.contentsScale = MIN(viewScale.width, viewScale.height); - return layer; -} - --(BOOL) layer: (CALayer *)layer shouldInheritContentsScale: (CGFloat)newScale fromWindow: (NSWindow *)window -{ - if (newScale == layer.contentsScale) { return NO; } - - layer.contentsScale = newScale; - return YES; -} -@end +#include "Cafe/HW/Latte/Renderer/MetalView.h" VkSurfaceKHR CreateCocoaSurface(VkInstance instance, void* handle) { diff --git a/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRenderer.cpp index 66369c103..6428b42a7 100644 --- a/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRenderer.cpp @@ -65,7 +65,7 @@ VKAPI_ATTR VkBool32 VKAPI_CALL DebugUtilsCallback(VkDebugUtilsMessageSeverityFla if (strstr(pCallbackData->pMessage, "consumes input location")) return VK_FALSE; // false means we dont care if (strstr(pCallbackData->pMessage, "blend")) - return VK_FALSE; // + return VK_FALSE; // // note: Check if previously used location in VK_EXT_debug_report callback is the same as messageIdNumber under the new extension // validation errors which are difficult to fix @@ -389,8 +389,8 @@ VulkanRenderer::VulkanRenderer() auto surface = CreateFramebufferSurface(m_instance, gui_getWindowInfo().window_main); auto& config = GetConfig(); - decltype(config.graphic_device_uuid) zero{}; - const bool has_device_set = config.graphic_device_uuid != zero; + decltype(config.vk_graphic_device_uuid) zero{}; + const bool has_device_set = config.vk_graphic_device_uuid != zero; VkPhysicalDevice fallbackDevice = VK_NULL_HANDLE; @@ -410,7 +410,7 @@ VulkanRenderer::VulkanRenderer() physDeviceProps.pNext = &physDeviceIDProps; vkGetPhysicalDeviceProperties2(device, &physDeviceProps); - if (memcmp(config.graphic_device_uuid.data(), 
physDeviceIDProps.deviceUUID, VK_UUID_SIZE) != 0) + if (memcmp(config.vk_graphic_device_uuid.data(), physDeviceIDProps.deviceUUID, VK_UUID_SIZE) != 0) continue; } @@ -423,7 +423,7 @@ VulkanRenderer::VulkanRenderer() { cemuLog_log(LogType::Force, "The selected GPU could not be found or is not suitable. Falling back to first available device instead"); m_physicalDevice = fallbackDevice; - config.graphic_device_uuid = {}; // resetting device selection + config.vk_graphic_device_uuid = {}; // resetting device selection } else if (m_physicalDevice == VK_NULL_HANDLE) { @@ -2324,7 +2324,7 @@ void VulkanRenderer::GetTextureFormatInfoVK(Latte::E_GX2SURFFMT format, bool isD } else { formatInfoOut->vkImageFormat = VK_FORMAT_R4G4B4A4_UNORM_PACK16; - formatInfoOut->decoder = TextureDecoder_R4_G4_UNORM_To_RGBA4_vk::getInstance(); + formatInfoOut->decoder = TextureDecoder_R4_G4_UNORM_To_ABGR4::getInstance(); } } else diff --git a/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRenderer.h b/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRenderer.h index 5ef4558da..01b3def33 100644 --- a/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRenderer.h +++ b/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRenderer.h @@ -73,11 +73,11 @@ class PipelineInfo return true; } - + template struct direct_hash { - size_t operator()(const uint64& k) const noexcept + size_t operator()(const uint64& k) const noexcept { return k; } @@ -277,7 +277,6 @@ class VulkanRenderer : public Renderer // texture functions void* texture_acquireTextureUploadBuffer(uint32 size) override; void texture_releaseTextureUploadBuffer(uint8* mem) override; - TextureDecoder* texture_chooseDecodedFormat(Latte::E_GX2SURFFMT format, bool isDepth, Latte::E_DIM dim, uint32 width, uint32 height) override; @@ -371,7 +370,7 @@ class VulkanRenderer : public Renderer VkRect2D currentScissorRect{}; // vertex bindings - struct + struct { uint32 offset; }currentVertexBinding[LATTE_MAX_VERTEX_BUFFERS]{}; @@ -464,12 +463,12 @@ class VulkanRenderer : public Renderer bool 
debug_utils = false; // VK_EXT_DEBUG_UTILS }instanceExtensions; - struct + struct { bool useTFEmulationViaSSBO = true; // emulate transform feedback via shader writes to a storage buffer }mode; - struct + struct { uint32 minUniformBufferOffsetAlignment = 256; uint32 nonCoherentAtomSize = 256; @@ -499,7 +498,7 @@ class VulkanRenderer : public Renderer void CreateCommandBuffers(); void swapchain_createDescriptorSetLayout(); - + // shader bool IsAsyncPipelineAllowed(uint32 numIndices); @@ -514,6 +513,8 @@ class VulkanRenderer : public Renderer void DeleteFontTextures() override; bool BeginFrame(bool mainWindow) override; + bool UseTFViaSSBO() const override { return m_featureControl.mode.useTFEmulationViaSSBO; } + // drawcall emulation PipelineInfo* draw_createGraphicsPipeline(uint32 indexCount); PipelineInfo* draw_getOrCreateGraphicsPipeline(uint32 indexCount); @@ -576,7 +577,7 @@ class VulkanRenderer : public Renderer VkDevice m_logicalDevice = VK_NULL_HANDLE; VkDebugUtilsMessengerEXT m_debugCallback = nullptr; volatile bool m_destructionRequested = false; - + QueueFamilyIndices m_indices{}; Semaphore m_pipeline_cache_semaphore; @@ -585,7 +586,7 @@ class VulkanRenderer : public Renderer VkPipelineCache m_pipeline_cache{ nullptr }; VkPipelineLayout m_pipelineLayout{nullptr}; VkCommandPool m_commandPool{ nullptr }; - + // buffer to cache uniform vars VkBuffer m_uniformVarBuffer = VK_NULL_HANDLE; VkDeviceMemory m_uniformVarBufferMemory = VK_NULL_HANDLE; @@ -657,19 +658,19 @@ class VulkanRenderer : public Renderer bool m_submitOnIdle{}; // submit current buffer if Latte command processor goes into idle state (no more commands or waiting for externally signaled condition) // tracking for dynamic offsets - struct + struct { uint32 uniformVarBufferOffset[VulkanRendererConst::SHADER_STAGE_INDEX_COUNT]; - struct + struct { uint32 uniformBufferOffset[LATTE_NUM_MAX_UNIFORM_BUFFERS]; }shaderUB[VulkanRendererConst::SHADER_STAGE_INDEX_COUNT]; }dynamicOffsetInfo{}; // streamout - 
struct + struct { - struct + struct { bool enabled; uint32 ringBufferOffset; @@ -719,11 +720,11 @@ class VulkanRenderer : public Renderer accessFlags = 0; if constexpr ((TSyncOp & BUFFER_SHADER_READ) != 0) { - // in theory: VK_ACCESS_INDEX_READ_BIT should be set here too but indices are currently separated + // in theory: VK_ACCESS_INDEX_READ_BIT should be set here too but indices are currently separated stages |= VK_PIPELINE_STAGE_VERTEX_INPUT_BIT | VK_PIPELINE_STAGE_VERTEX_SHADER_BIT | VK_PIPELINE_STAGE_GEOMETRY_SHADER_BIT | VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT; accessFlags |= VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT | VK_ACCESS_UNIFORM_READ_BIT | VK_ACCESS_SHADER_READ_BIT; } - + if constexpr ((TSyncOp & BUFFER_SHADER_WRITE) != 0) { stages |= VK_PIPELINE_STAGE_VERTEX_SHADER_BIT | VK_PIPELINE_STAGE_GEOMETRY_SHADER_BIT | VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT; @@ -926,7 +927,6 @@ class VulkanRenderer : public Renderer public: bool GetDisableMultithreadedCompilation() const { return m_featureControl.disableMultithreadedCompilation; } - bool UseTFViaSSBO() const { return m_featureControl.mode.useTFEmulationViaSSBO; } bool HasSPRIVRoundingModeRTE32() const { return m_featureControl.shaderFloatControls.shaderRoundingModeRTEFloat32; } bool IsDebugUtilsEnabled() const { return m_featureControl.debugMarkersSupported && m_featureControl.instanceExtensions.debug_utils; } @@ -936,7 +936,7 @@ class VulkanRenderer : public Renderer void debug_genericBarrier(); // shaders - struct + struct { RendererShaderVk* copySurface_vs{}; RendererShaderVk* copySurface_psDepth2Color{}; diff --git a/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRendererCore.cpp b/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRendererCore.cpp index 198a32cb8..9a57eeaba 100644 --- a/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRendererCore.cpp +++ b/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRendererCore.cpp @@ -60,7 +60,7 @@ uint64 VulkanRenderer::draw_calculateGraphicsPipelineHash(const LatteFetchShader uint64 stateHash; stateHash = 
draw_calculateMinimalGraphicsPipelineHash(fetchShader, lcr); stateHash = (stateHash >> 8) + (stateHash * 0x370531ull) % 0x7F980D3BF9B4639Dull; - + uint32* ctxRegister = lcr.GetRawView(); if (vertexShader) @@ -103,7 +103,7 @@ uint64 VulkanRenderer::draw_calculateGraphicsPipelineHash(const LatteFetchShader } stateHash += renderPassObj->m_hashForPipeline; - + uint32 depthControl = ctxRegister[Latte::REGADDR::DB_DEPTH_CONTROL]; bool stencilTestEnable = depthControl & 1; if (stencilTestEnable) @@ -111,7 +111,7 @@ uint64 VulkanRenderer::draw_calculateGraphicsPipelineHash(const LatteFetchShader stateHash += ctxRegister[mmDB_STENCILREFMASK]; stateHash = std::rotl(stateHash, 17); if(depthControl & (1<<7)) // back stencil enable - { + { stateHash += ctxRegister[mmDB_STENCILREFMASK_BF]; stateHash = std::rotl(stateHash, 13); } @@ -302,7 +302,7 @@ PipelineInfo* VulkanRenderer::draw_createGraphicsPipeline(uint32 indexCount) pipelineCompiler->TrackAsCached(vsBaseHash, pipelineHash); // use heuristics based on parameter patterns to determine if the current drawcall is essential (non-skipable) - bool allowAsyncCompile = false; + bool allowAsyncCompile = false; if (GetConfig().async_compile) allowAsyncCompile = IsAsyncPipelineAllowed(indexCount); @@ -735,8 +735,8 @@ VkDescriptorSetInfo* VulkanRenderer::draw_getOrCreateDescriptorSet(PipelineInfo* LatteTexture* baseTexture = textureView->baseTexture; // get texture register word 0 uint32 word4 = LatteGPUState.contextRegister[texUnitRegIndex + 4]; - - auto imageViewObj = textureView->GetSamplerView(word4); + + auto imageViewObj = textureView->GetSamplerView(word4); info.imageView = imageViewObj->m_textureImageView; vkObjDS->addRef(imageViewObj); @@ -806,7 +806,7 @@ VkDescriptorSetInfo* VulkanRenderer::draw_getOrCreateDescriptorSet(PipelineInfo* VK_SAMPLER_ADDRESS_MODE_REPEAT, // WRAP VK_SAMPLER_ADDRESS_MODE_MIRRORED_REPEAT, // MIRROR VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE, // CLAMP_LAST_TEXEL - 
VK_SAMPLER_ADDRESS_MODE_MIRROR_CLAMP_TO_EDGE, // MIRROR_ONCE_LAST_TEXEL + VK_SAMPLER_ADDRESS_MODE_MIRROR_CLAMP_TO_EDGE, // MIRROR_ONCE_LAST_TEXEL VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE, // unsupported HALF_BORDER VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER, // unsupported MIRROR_ONCE_HALF_BORDER VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER, // CLAMP_BORDER @@ -933,7 +933,7 @@ VkDescriptorSetInfo* VulkanRenderer::draw_getOrCreateDescriptorSet(PipelineInfo* uniformVarsBufferInfo.buffer = m_uniformVarBuffer; uniformVarsBufferInfo.offset = 0; // fixed offset is always zero since we only use dynamic offsets uniformVarsBufferInfo.range = shader->uniform.uniformRangeSize; - + VkWriteDescriptorSet write_descriptor{}; write_descriptor.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; write_descriptor.dstSet = result; @@ -1244,7 +1244,7 @@ void VulkanRenderer::draw_setRenderPass() draw_endRenderPass(); if (m_state.descriptorSetsChanged) sync_inputTexturesChanged(); - + // assume that FBO changed, update self-dependency state m_state.hasRenderSelfDependency = fboVk->CheckForCollision(m_state.activeVertexDS, m_state.activeGeometryDS, m_state.activePixelDS); diff --git a/src/Cemu/FileCache/FileCache.cpp b/src/Cemu/FileCache/FileCache.cpp index b284b66bd..820115d66 100644 --- a/src/Cemu/FileCache/FileCache.cpp +++ b/src/Cemu/FileCache/FileCache.cpp @@ -111,7 +111,7 @@ FileCache* FileCache::Create(const fs::path& path, uint32 extraVersion) fileCache->fileTableEntries[0].fileOffset = fileCache->fileTableOffset; fileCache->fileTableEntries[0].fileSize = fileCache->fileTableSize; // write header - + fs->writeU32(FILECACHE_MAGIC_V3); fs->writeU32(fileCache->extraVersion); fs->writeU64(fileCache->dataOffset); @@ -316,7 +316,7 @@ bool _uncompressFileData(const uint8* rawData, size_t rawSize, std::vectorSetPosition(this->dataOffset + currentStartOffset); fileStream->writeData(rawData, rawSize); +#ifdef __APPLE__ + fileStream->Flush(); +#endif // write file table entry 
fileStream->SetPosition(this->dataOffset + this->fileTableOffset + (uint64)(sizeof(FileTableEntry)*entryIndex)); fileStream->writeData(this->fileTableEntries + entryIndex, sizeof(FileTableEntry)); +#ifdef __APPLE__ + fileStream->Flush(); +#endif if (isCompressed) free(rawData); } diff --git a/src/Cemu/Logging/CemuLogging.cpp b/src/Cemu/Logging/CemuLogging.cpp index 5cde2a7fb..d7e3bc3e6 100644 --- a/src/Cemu/Logging/CemuLogging.cpp +++ b/src/Cemu/Logging/CemuLogging.cpp @@ -158,7 +158,7 @@ bool cemuLog_log(LogType type, std::string_view text) bool cemuLog_log(LogType type, std::u8string_view text) { - std::basic_string_view s((char*)text.data(), text.size()); + std::basic_string_view s((char*)text.data(), text.size()); return cemuLog_log(type, s); } diff --git a/src/Cemu/Logging/CemuLogging.h b/src/Cemu/Logging/CemuLogging.h index 5b2e5fa40..fae134b47 100644 --- a/src/Cemu/Logging/CemuLogging.h +++ b/src/Cemu/Logging/CemuLogging.h @@ -52,7 +52,7 @@ enum class LogType : sint32 template <> struct fmt::formatter : formatter { template - auto format(std::u8string_view v, FormatContext& ctx) + auto format(std::u8string_view v, FormatContext& ctx) { string_view s((char*)v.data(), v.size()); return formatter::format(s, ctx); @@ -100,7 +100,7 @@ bool cemuLog_log(LogType type, std::basic_string formatStr, TArgs&&... args) } return true; } - + template bool cemuLog_log(LogType type, const T* format, TArgs&&... 
args) { diff --git a/src/Common/unix/FileStream_unix.cpp b/src/Common/unix/FileStream_unix.cpp index 4bc9b5263..0e9f11895 100644 --- a/src/Common/unix/FileStream_unix.cpp +++ b/src/Common/unix/FileStream_unix.cpp @@ -116,6 +116,11 @@ void FileStream::extract(std::vector& data) readData(data.data(), fileSize); } +void FileStream::Flush() +{ + m_fileStream.flush(); +} + uint32 FileStream::readData(void* data, uint32 length) { SyncReadWriteSeek(false); diff --git a/src/Common/unix/FileStream_unix.h b/src/Common/unix/FileStream_unix.h index 12c971d14..0a2fa7ed9 100644 --- a/src/Common/unix/FileStream_unix.h +++ b/src/Common/unix/FileStream_unix.h @@ -22,6 +22,8 @@ class FileStream bool SetEndOfFile(); void extract(std::vector& data); + void Flush(); + // reading uint32 readData(void* data, uint32 length); bool readU64(uint64& v); diff --git a/src/config/CemuConfig.cpp b/src/config/CemuConfig.cpp index 6bb7ac34c..7542dc310 100644 --- a/src/config/CemuConfig.cpp +++ b/src/config/CemuConfig.cpp @@ -32,7 +32,7 @@ void CemuConfig::Load(XMLConfigParser& parser) mlc_path = mlc; permanent_storage = parser.get("permanent_storage", permanent_storage); - + language = parser.get("language", wxLANGUAGE_DEFAULT); use_discord_presence = parser.get("use_discord_presence", true); fullscreen_menubar = parser.get("fullscreen_menubar", false); @@ -103,7 +103,7 @@ void CemuConfig::Load(XMLConfigParser& parser) cemuLog_log(LogType::Force, "config load error: can't load recently launched game file: {}", path); } } - + recent_nfc_files.clear(); auto nfc_parser = parser.get("RecentNFCFiles"); for (auto element = nfc_parser.get("Entry"); element.valid(); element = nfc_parser.get("Entry", element)) @@ -199,7 +199,7 @@ void CemuConfig::Load(XMLConfigParser& parser) { graphic_pack_entries[path].try_emplace("_disabled", "true"); } - + for (auto preset = element.get("Preset"); preset.valid(); preset = element.get("Preset", preset)) { const std::string category = preset.get("category", ""); @@ 
-207,13 +207,14 @@ void CemuConfig::Load(XMLConfigParser& parser) graphic_pack_entries[path].try_emplace(category, active_preset); } } - + } // graphics auto graphic = parser.get("Graphic"); graphic_api = graphic.get("api", kOpenGL); - graphic.get("device", graphic_device_uuid); + graphic.get("vkDevice", vk_graphic_device_uuid); + mtl_graphic_device_uuid = graphic.get("mtlDevice", 0); vsync = graphic.get("VSync", 0); gx2drawdone_sync = graphic.get("GX2DrawdoneSync", true); upscale_filter = graphic.get("UpscaleFilter", kBicubicHermiteFilter); @@ -336,6 +337,8 @@ void CemuConfig::Load(XMLConfigParser& parser) crash_dump = debug.get("CrashDumpUnix", crash_dump); #endif gdb_port = debug.get("GDBPort", 1337); + gpu_capture_dir = debug.get("GPUCaptureDir", ""); + framebuffer_fetch = debug.get("FramebufferFetch", true); // input auto input = parser.get("Input"); @@ -376,7 +379,7 @@ void CemuConfig::Save(XMLConfigParser& parser) // config.set("cpu_mode", cpu_mode.GetValue()); //config.set("console_region", console_region.GetValue()); config.set("console_language", console_language.GetValue()); - + auto wpos = config.set("window_position"); wpos.set("x", window_position.x); wpos.set("y", window_position.y); @@ -411,13 +414,13 @@ void CemuConfig::Save(XMLConfigParser& parser) { launch_files_parser.set("Entry", entry.c_str()); } - + auto nfc_files_parser = config.set("RecentNFCFiles"); for (const auto& entry : recent_nfc_files) { nfc_files_parser.set("Entry", entry.c_str()); } - + // game paths auto game_path_parser = config.set("GamePaths"); for (const auto& entry : game_paths) @@ -458,11 +461,11 @@ void CemuConfig::Save(XMLConfigParser& parser) entry.set_attribute("disabled", true); continue; } - + auto preset = entry.set("Preset"); if(!kv.first.empty()) preset.set("category", kv.first.c_str()); - + preset.set("preset", kv.second.c_str()); } } @@ -470,7 +473,8 @@ void CemuConfig::Save(XMLConfigParser& parser) // graphics auto graphic = config.set("Graphic"); 
graphic.set("api", graphic_api); - graphic.set("device", graphic_device_uuid); + graphic.set("vkDevice", vk_graphic_device_uuid); + graphic.set("mtlDevice", mtl_graphic_device_uuid); graphic.set("VSync", vsync); graphic.set("GX2DrawdoneSync", gx2drawdone_sync); //graphic.set("PrecompiledShaders", precompiled_shaders.GetValue()); @@ -537,6 +541,8 @@ void CemuConfig::Save(XMLConfigParser& parser) debug.set("CrashDumpUnix", crash_dump.GetValue()); #endif debug.set("GDBPort", gdb_port); + debug.set("GPUCaptureDir", gpu_capture_dir); + debug.set("FramebufferFetch", framebuffer_fetch); // input auto input = config.set("Input"); diff --git a/src/config/CemuConfig.h b/src/config/CemuConfig.h index 191614a27..991d9a89e 100644 --- a/src/config/CemuConfig.h +++ b/src/config/CemuConfig.h @@ -32,7 +32,7 @@ struct GameEntry std::wstring save_folder; std::wstring update_folder; std::wstring dlc_folder; - + uint64 legacy_time_played = 0; uint64 legacy_last_played = 0; @@ -74,6 +74,7 @@ enum GraphicAPI { kOpenGL = 0, kVulkan, + kMetal, }; enum AudioChannels @@ -105,7 +106,7 @@ enum class ScreenPosition kTopRight, kBottomLeft, kBottomCenter, - kBottomRight, + kBottomRight, }; enum class PrecompiledShaderOption @@ -123,6 +124,14 @@ enum class AccurateShaderMulOption }; ENABLE_ENUM_ITERATORS(AccurateShaderMulOption, AccurateShaderMulOption::False, AccurateShaderMulOption::True); +enum class BufferCacheMode +{ + DevicePrivate, + DeviceShared, + Host, +}; +ENABLE_ENUM_ITERATORS(BufferCacheMode, BufferCacheMode::DevicePrivate, BufferCacheMode::Host); + enum class CPUMode { SinglecoreInterpreter = 0, @@ -134,7 +143,7 @@ enum class CPUMode ENABLE_ENUM_ITERATORS(CPUMode, CPUMode::SinglecoreInterpreter, CPUMode::Auto); -enum class CPUModeLegacy +enum class CPUModeLegacy { SinglecoreInterpreter = 0, SinglecoreRecompiler = 1, @@ -221,6 +230,21 @@ struct fmt::formatter : formatter { } }; template <> +struct fmt::formatter : formatter { + template + auto format(const BufferCacheMode c, 
FormatContext &ctx) const { + string_view name; + switch (c) + { + case BufferCacheMode::DevicePrivate: name = "device private"; break; + case BufferCacheMode::DeviceShared: name = "device shared"; break; + case BufferCacheMode::Host: name = "host"; break; + default: name = "unknown"; break; + } + return formatter::format(name, ctx); + } +}; +template <> struct fmt::formatter : formatter { template auto format(const CPUMode c, FormatContext &ctx) const { @@ -270,7 +294,7 @@ struct fmt::formatter : formatter { case CafeConsoleRegion::TWN: name = wxTRANSLATE("Taiwan"); break; case CafeConsoleRegion::Auto: name = wxTRANSLATE("Auto"); break; default: name = wxTRANSLATE("many"); break; - + } return formatter::format(name, ctx); } @@ -312,7 +336,7 @@ struct fmt::formatter : formatter { case CrashDump::Lite: name = "Lite"; break; case CrashDump::Full: name = "Full"; break; default: name = "unknown"; break; - + } return formatter::format(name, ctx); } @@ -363,7 +387,7 @@ struct CemuConfig ConfigValue advanced_ppc_logging{ false }; ConfigValue permanent_storage{ true }; - + ConfigValue language{ wxLANGUAGE_DEFAULT }; ConfigValue use_discord_presence{ true }; ConfigValue mlc_path{}; @@ -388,7 +412,7 @@ struct CemuConfig // optimized access std::set game_cache_favorites; // per titleId - + struct _path_hash { std::size_t operator()(const fs::path& path) const { return fs::hash_value(path); @@ -439,9 +463,10 @@ struct CemuConfig // graphics ConfigValue graphic_api{ kVulkan }; - std::array graphic_device_uuid; - ConfigValue vsync{ 0 }; // 0 = off, 1+ = on depending on render backend - ConfigValue gx2drawdone_sync {true}; + std::array vk_graphic_device_uuid; + uint64 mtl_graphic_device_uuid{ 0 }; + ConfigValue vsync{ 0 }; // 0 = off, 1+ = depending on render backend + ConfigValue gx2drawdone_sync { true }; ConfigValue render_upside_down{ false }; ConfigValue async_compile{ true }; @@ -502,6 +527,8 @@ struct CemuConfig // debug ConfigValueBounds crash_dump{ CrashDump::Disabled }; 
ConfigValue gdb_port{ 1337 }; + ConfigValue gpu_capture_dir{ "" }; + ConfigValue framebuffer_fetch{ true }; void Load(XMLConfigParser& parser); void Save(XMLConfigParser& parser); @@ -516,7 +543,7 @@ struct CemuConfig NetworkService GetAccountNetworkService(uint32 persistentId); void SetAccountSelectedService(uint32 persistentId, NetworkService serviceIndex); - + // emulated usb devices struct { @@ -546,5 +573,3 @@ struct CemuConfig typedef XMLDataConfig XMLCemuConfig_t; extern XMLCemuConfig_t g_config; inline CemuConfig& GetConfig() { return g_config.data(); } - - diff --git a/src/gui/CMakeLists.txt b/src/gui/CMakeLists.txt index 7cdc208eb..7b76ee179 100644 --- a/src/gui/CMakeLists.txt +++ b/src/gui/CMakeLists.txt @@ -1,4 +1,4 @@ -add_library(CemuGui +add_library(CemuGui canvas/IRenderCanvas.h canvas/OpenGLCanvas.cpp canvas/OpenGLCanvas.h @@ -129,6 +129,13 @@ add_library(CemuGui wxHelper.h ) +if(ENABLE_METAL) + target_sources(CemuGui PRIVATE + canvas/MetalCanvas.cpp + canvas/MetalCanvas.h + ) +endif() + set_property(TARGET CemuGui PROPERTY MSVC_RUNTIME_LIBRARY "MultiThreaded$<$:Debug>") diff --git a/src/gui/CemuApp.cpp b/src/gui/CemuApp.cpp index c3606292e..cfe436ae0 100644 --- a/src/gui/CemuApp.cpp +++ b/src/gui/CemuApp.cpp @@ -383,7 +383,8 @@ void CemuApp::OnAssertFailure(const wxChar* file, int line, const wxChar* func, #if BOOST_OS_WINDOWS DumpThreadStackTrace(); #endif - cemu_assert_debug(false); + // HACK + //cemu_assert_debug(false); } int CemuApp::FilterEvent(wxEvent& event) @@ -567,5 +568,3 @@ void CemuApp::ActivateApp(wxActivateEvent& event) g_window_info.app_active = event.GetActive(); event.Skip(); } - - diff --git a/src/gui/GameProfileWindow.cpp b/src/gui/GameProfileWindow.cpp index f15395e42..c46f0f254 100644 --- a/src/gui/GameProfileWindow.cpp +++ b/src/gui/GameProfileWindow.cpp @@ -8,6 +8,7 @@ #include #include +#include "config/CemuConfig.h" #include "gui/helpers/wxHelpers.h" #include "input/InputManager.h" @@ -61,7 +62,7 @@ 
GameProfileWindow::GameProfileWindow(wxWindow* parent, uint64_t title_id) const sint32 m_cpu_modeNChoices = std::size(cpu_modes); m_cpu_mode = new wxChoice(box, wxID_ANY, wxDefaultPosition, wxDefaultSize, m_cpu_modeNChoices, cpu_modes, 0); m_cpu_mode->SetToolTip(_("Set the CPU emulation mode")); - first_row->Add(m_cpu_mode, 0, wxALL, 5); + first_row->Add(m_cpu_mode, 0, wxALL, 5); first_row->Add(new wxStaticText(box, wxID_ANY, _("Thread quantum")), 0, wxALIGN_CENTER_VERTICAL | wxALL, 5); @@ -112,10 +113,14 @@ GameProfileWindow::GameProfileWindow(wxWindow* parent, uint64_t title_id) first_row->Add(new wxStaticText(panel, wxID_ANY, _("Graphics API")), 0, wxALIGN_CENTER_VERTICAL | wxALL, 5); - wxString gapi_values[] = { "", "OpenGL", "Vulkan" }; + wxString gapi_values[] = { "", "OpenGL", "Vulkan", +#if ENABLE_METAL + "Metal" +#endif + }; m_graphic_api = new wxChoice(panel, wxID_ANY, wxDefaultPosition, wxDefaultSize, (int)std::size(gapi_values), gapi_values); first_row->Add(m_graphic_api, 0, wxALL, 5); - + first_row->Add(new wxStaticText(panel, wxID_ANY, _("Shader multiplication accuracy")), 0, wxALIGN_CENTER_VERTICAL | wxALL, 5); wxString mul_values[] = { _("false"), _("true")}; @@ -123,6 +128,27 @@ GameProfileWindow::GameProfileWindow(wxWindow* parent, uint64_t title_id) m_shader_mul_accuracy->SetToolTip(_("EXPERT OPTION\nControls the accuracy of floating point multiplication in shaders.\n\nRecommended: true")); first_row->Add(m_shader_mul_accuracy, 0, wxALL, 5); + first_row->Add(new wxStaticText(panel, wxID_ANY, _("Fast math")), 0, wxALIGN_CENTER_VERTICAL | wxALL, 5); + + wxString math_values[] = { _("false"), _("true") }; + m_fast_math = new wxChoice(panel, wxID_ANY, wxDefaultPosition, wxDefaultSize, (int)std::size(math_values), math_values); + m_fast_math->SetToolTip(_("Enables fast math for all shaders. 
May (rarely) cause graphical bugs.\n\nMetal only\n\nRecommended: true")); + first_row->Add(m_fast_math, 0, wxALL, 5); + + first_row->Add(new wxStaticText(panel, wxID_ANY, _("Buffer cache mode")), 0, wxALIGN_CENTER_VERTICAL | wxALL, 5); + + wxString cache_values[] = { _("device private"), _("device shared"), _("host") }; + m_buffer_cache_mode = new wxChoice(panel, wxID_ANY, wxDefaultPosition, wxDefaultSize, (int)std::size(cache_values), cache_values); + m_buffer_cache_mode->SetToolTip(_("EXPERT OPTION\nDecides how the buffer cache memory will be managed.\n\nMetal only\n\nRecommended: device private")); + first_row->Add(m_buffer_cache_mode, 0, wxALL, 5); + + first_row->Add(new wxStaticText(panel, wxID_ANY, _("Position invariance")), 0, wxALIGN_CENTER_VERTICAL | wxALL, 5); + + wxString pos_values[] = { _("false"), _("true") }; + m_position_invariance = new wxChoice(panel, wxID_ANY, wxDefaultPosition, wxDefaultSize, (int)std::size(pos_values), pos_values); + m_position_invariance->SetToolTip(_("Disables most optimizations for vertex positions. 
May fix polygon cutouts in some games.\n\nMetal only\n\nRecommended: false")); + first_row->Add(m_position_invariance, 0, wxALL, 5); + /*first_row->Add(new wxStaticText(panel, wxID_ANY, _("GPU buffer cache accuracy")), 0, wxALIGN_CENTER_VERTICAL | wxALL, 5); wxString accuarcy_values[] = { _("high"), _("medium"), _("low") }; m_cache_accuracy = new wxChoice(panel, wxID_ANY, wxDefaultPosition, wxDefaultSize, (int)std::size(accuarcy_values), accuarcy_values); @@ -249,7 +275,7 @@ void GameProfileWindow::ApplyProfile() // general m_load_libs->SetValue(m_game_profile.m_loadSharedLibraries.value()); m_start_with_padview->SetValue(m_game_profile.m_startWithPadView); - + // cpu // wxString cpu_modes[] = { _("Singlecore-Interpreter"), _("Singlecore-Recompiler"), _("Triplecore-Recompiler"), _("Auto (recommended)") }; switch(m_game_profile.m_cpuMode.value()) @@ -258,24 +284,27 @@ void GameProfileWindow::ApplyProfile() case CPUMode::SinglecoreRecompiler: m_cpu_mode->SetSelection(1); break; case CPUMode::DualcoreRecompiler: m_cpu_mode->SetSelection(2); break; case CPUMode::MulticoreRecompiler: m_cpu_mode->SetSelection(2); break; - default: m_cpu_mode->SetSelection(3); + default: m_cpu_mode->SetSelection(3); } - + m_thread_quantum->SetStringSelection(fmt::format("{}", m_game_profile.m_threadQuantum)); // gpu if (!m_game_profile.m_graphics_api.has_value()) m_graphic_api->SetSelection(0); // selecting "" else - m_graphic_api->SetSelection(1 + m_game_profile.m_graphics_api.value()); // "", OpenGL, Vulkan + m_graphic_api->SetSelection(1 + m_game_profile.m_graphics_api.value()); // "", OpenGL, Vulkan, Metal m_shader_mul_accuracy->SetSelection((int)m_game_profile.m_accurateShaderMul); + m_fast_math->SetSelection((int)m_game_profile.m_fastMath); + m_buffer_cache_mode->SetSelection((int)m_game_profile.m_bufferCacheMode); + m_position_invariance->SetSelection((int)m_game_profile.m_positionInvariance); //// audio 
//m_disable_audio->Set3StateValue(GetCheckboxState(m_game_profile.disableAudio)); // controller auto profiles = InputManager::get_profiles(); - + for (const auto& cb : m_controller_profile) { cb->Clear(); @@ -293,7 +322,7 @@ void GameProfileWindow::ApplyProfile() const auto& v = m_game_profile.m_controllerProfile[i].value(); m_controller_profile[i]->SetStringSelection(wxString::FromUTF8(v)); } - + else m_controller_profile[i]->SetSelection(wxNOT_FOUND); } @@ -317,7 +346,7 @@ void GameProfileWindow::SaveProfile() m_game_profile.m_cpuMode = CPUMode::Auto; } - + const wxString thread_quantum = m_thread_quantum->GetStringSelection(); if (!thread_quantum.empty()) { @@ -328,13 +357,16 @@ void GameProfileWindow::SaveProfile() // gpu m_game_profile.m_accurateShaderMul = (AccurateShaderMulOption)m_shader_mul_accuracy->GetSelection(); + m_game_profile.m_fastMath = (bool)m_fast_math->GetSelection(); + m_game_profile.m_bufferCacheMode = (BufferCacheMode)m_buffer_cache_mode->GetSelection(); + m_game_profile.m_positionInvariance = (bool)m_position_invariance->GetSelection(); if (m_game_profile.m_accurateShaderMul != AccurateShaderMulOption::False && m_game_profile.m_accurateShaderMul != AccurateShaderMulOption::True) m_game_profile.m_accurateShaderMul = AccurateShaderMulOption::True; // force a legal value if (m_graphic_api->GetSelection() == 0) m_game_profile.m_graphics_api = {}; else - m_game_profile.m_graphics_api = (GraphicAPI)(m_graphic_api->GetSelection() - 1); // "", OpenGL, Vulkan + m_game_profile.m_graphics_api = (GraphicAPI)(m_graphic_api->GetSelection() - 1); // "", OpenGL, Vulkan, Metal // controller for (int i = 0; i < 8; ++i) @@ -365,4 +397,4 @@ void GameProfileWindow::SetSliderValue(wxSlider* slider, sint32 new_value) const slider_event.SetEventObject(slider); slider_event.SetClientData((void*)IsFrozen()); wxPostEvent(slider->GetEventHandler(), slider_event); -} \ No newline at end of file +} diff --git a/src/gui/GameProfileWindow.h b/src/gui/GameProfileWindow.h 
index 6ca36de68..ddd72c775 100644 --- a/src/gui/GameProfileWindow.h +++ b/src/gui/GameProfileWindow.h @@ -40,6 +40,9 @@ class GameProfileWindow : public wxFrame wxChoice* m_graphic_api; wxChoice* m_shader_mul_accuracy; + wxChoice* m_fast_math; + wxChoice* m_buffer_cache_mode; + wxChoice* m_position_invariance; //wxChoice* m_cache_accuracy; // audio @@ -47,4 +50,4 @@ class GameProfileWindow : public wxFrame // controller wxComboBox* m_controller_profile[8]; -}; \ No newline at end of file +}; diff --git a/src/gui/GeneralSettings2.cpp b/src/gui/GeneralSettings2.cpp index 9b7632295..31d164819 100644 --- a/src/gui/GeneralSettings2.cpp +++ b/src/gui/GeneralSettings2.cpp @@ -10,6 +10,7 @@ #include #include #include +#include #include #include @@ -27,6 +28,9 @@ #include "Cafe/HW/Latte/Renderer/Vulkan/VulkanAPI.h" #include "Cafe/HW/Latte/Renderer/Vulkan/VulkanRenderer.h" +#if ENABLE_METAL +#include "Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h" +#endif #include "Cafe/Account/Account.h" #include @@ -93,6 +97,19 @@ class wxVulkanUUID : public wxClientData VulkanRenderer::DeviceInfo m_device_info; }; +#if ENABLE_METAL +class wxMetalUUID : public wxClientData +{ +public: + wxMetalUUID(const MetalRenderer::DeviceInfo& info) + : m_device_info(info) {} + const MetalRenderer::DeviceInfo& GetDeviceInfo() const { return m_device_info; } + +private: + MetalRenderer::DeviceInfo m_device_info; +}; +#endif + class wxAccountData : public wxClientData { public: @@ -101,7 +118,7 @@ class wxAccountData : public wxClientData Account& GetAccount() { return m_account; } const Account& GetAccount() const { return m_account; } - + private: Account m_account; }; @@ -311,12 +328,14 @@ wxPanel* GeneralSettings2::AddGraphicsPage(wxNotebook* notebook) row->Add(new wxStaticText(box, wxID_ANY, _("Graphics API")), 0, wxALIGN_CENTER_VERTICAL | wxALL, 5); sint32 api_size = 1; - wxString choices[2] = { "OpenGL" }; + wxString choices[3] = { "OpenGL" }; if (g_vulkan_available) { - choices[1] = "Vulkan"; - 
api_size = 2; + choices[api_size++] = "Vulkan"; } +#if ENABLE_METAL + choices[api_size++] = "Metal"; +#endif m_graphic_api = new wxChoice(box, wxID_ANY, wxDefaultPosition, wxDefaultSize, api_size, choices); m_graphic_api->SetSelection(0); @@ -763,7 +782,7 @@ wxPanel* GeneralSettings2::AddAccountPage(wxNotebook* notebook) auto* row = new wxFlexGridSizer(0, 2, 0, 0); row->SetFlexibleDirection(wxBOTH); row->SetNonFlexibleGrowMode(wxFLEX_GROWMODE_SPECIFIED); - + const wxImage tmp = wxBITMAP_PNG_FROM_DATA(PNG_ERROR).ConvertToImage(); m_validate_online = new wxBitmapButton(box, wxID_ANY, tmp.Scale(16, 16)); m_validate_online->Bind(wxEVT_BUTTON, &GeneralSettings2::OnShowOnlineValidator, this); @@ -773,7 +792,7 @@ wxPanel* GeneralSettings2::AddAccountPage(wxNotebook* notebook) row->Add(m_online_status, 1, wxALL | wxALIGN_CENTRE_VERTICAL, 5); box_sizer->Add(row, 1, wxEXPAND, 5); - + auto* tutorial_link = new wxHyperlinkCtrl(box, wxID_ANY, _("Online play tutorial"), "https://cemu.info/online-guide"); box_sizer->Add(tutorial_link, 0, wxALL, 5); @@ -876,6 +895,33 @@ wxPanel* GeneralSettings2::AddDebugPage(wxNotebook* notebook) debug_panel_sizer->Add(debug_row, 0, wxALL | wxEXPAND, 5); } + { + auto* debug_row = new wxFlexGridSizer(0, 2, 0, 0); + debug_row->SetFlexibleDirection(wxBOTH); + debug_row->SetNonFlexibleGrowMode(wxFLEX_GROWMODE_SPECIFIED); + + debug_row->Add(new wxStaticText(panel, wxID_ANY, _("GPU capture save directory"), wxDefaultPosition, wxDefaultSize, 0), 0, wxALIGN_CENTER_VERTICAL | wxALL, 5); + + m_gpu_capture_dir = new wxTextCtrl(panel, wxID_ANY, wxEmptyString, wxDefaultPosition, wxDefaultSize, wxTE_DONTWRAP); + m_gpu_capture_dir->SetMinSize(wxSize(150, -1)); + m_gpu_capture_dir->SetToolTip(_("Cemu will save the GPU captures done by selecting Debug -> GPU capture in the menu bar in this directory. If a debugger with support for GPU captures (like Xcode) is attached, the capture will be opened in that debugger instead. 
If such debugger is not attached, METAL_CAPTURE_ENABLED must be set to 1 as an environment variable.")); + + debug_row->Add(m_gpu_capture_dir, 0, wxALL | wxEXPAND, 5); + debug_panel_sizer->Add(debug_row, 0, wxALL | wxEXPAND, 5); + } + + { + auto* debug_row = new wxFlexGridSizer(0, 2, 0, 0); + debug_row->SetFlexibleDirection(wxBOTH); + debug_row->SetNonFlexibleGrowMode(wxFLEX_GROWMODE_SPECIFIED); + + m_framebuffer_fetch = new wxCheckBox(panel, wxID_ANY, _("Framebuffer fetch")); + m_framebuffer_fetch->SetToolTip(_("Enable framebuffer fetch for eligible textures on supported devices.")); + + debug_row->Add(m_framebuffer_fetch, 0, wxALL | wxEXPAND, 5); + debug_panel_sizer->Add(debug_row, 0, wxALL | wxEXPAND, 5); + } + panel->SetSizerAndFit(debug_panel_sizer); return panel; @@ -891,14 +937,14 @@ GeneralSettings2::GeneralSettings2(wxWindow* parent, bool game_launched) notebook->AddPage(AddGeneralPage(notebook), _("General")); notebook->AddPage(AddGraphicsPage(notebook), _("Graphics")); - notebook->AddPage(AddAudioPage(notebook), _("Audio")); + notebook->AddPage(AddAudioPage(notebook), _("Audio")); notebook->AddPage(AddOverlayPage(notebook), _("Overlay")); notebook->AddPage(AddAccountPage(notebook), _("Account")); notebook->AddPage(AddDebugPage(notebook), _("Debug")); Bind(wxEVT_CLOSE_WINDOW, &GeneralSettings2::OnClose, this); - // + // sizer->Add(notebook, 1, wxEXPAND | wxALL, 5); @@ -913,7 +959,7 @@ GeneralSettings2::GeneralSettings2(wxWindow* parent, bool game_launched) ApplyConfig(); HandleGraphicsApiSelection(); - + DisableSettings(game_launched); } @@ -925,7 +971,7 @@ uint32 GeneralSettings2::GetSelectedAccountPersistentId() return dynamic_cast(m_active_account->GetClientObject(active_account))->GetAccount().GetPersistentId(); } -void GeneralSettings2::StoreConfig() +void GeneralSettings2::StoreConfig() { auto* app = (CemuApp*)wxTheApp; auto& config = GetConfig(); @@ -946,7 +992,6 @@ void GeneralSettings2::StoreConfig() 
ScreenSaver::SetInhibit(config.disable_screensaver); } - // -1 is default wx widget value -> set to dummy 0 so mainwindow and padwindow will update it config.window_position = m_save_window_position_size->IsChecked() ? Vector2i{ 0,0 } : Vector2i{-1,-1}; config.window_size = m_save_window_position_size->IsChecked() ? Vector2i{ 0,0 } : Vector2i{-1,-1}; @@ -989,7 +1034,7 @@ void GeneralSettings2::StoreConfig() config.pad_channels = kStereo; // (AudioChannels)m_pad_channels->GetSelection(); //config.input_channels = (AudioChannels)m_input_channels->GetSelection(); config.input_channels = kMono; // (AudioChannels)m_input_channels->GetSelection(); - + config.tv_volume = m_tv_volume->GetValue(); config.pad_volume = m_pad_volume->GetValue(); config.input_volume = m_input_volume->GetValue(); @@ -1025,26 +1070,42 @@ void GeneralSettings2::StoreConfig() config.graphic_api = (GraphicAPI)m_graphic_api->GetSelection(); selection = m_graphic_device->GetSelection(); - if(selection != wxNOT_FOUND) - { - const auto* info = (wxVulkanUUID*)m_graphic_device->GetClientObject(selection); - if(info) - config.graphic_device_uuid = info->GetDeviceInfo().uuid; - else - config.graphic_device_uuid = {}; + if (config.graphic_api == GraphicAPI::kVulkan) + { + if (selection != wxNOT_FOUND) + { + const auto* info = (wxVulkanUUID*)m_graphic_device->GetClientObject(selection); + if (info) + config.vk_graphic_device_uuid = info->GetDeviceInfo().uuid; + else + config.vk_graphic_device_uuid = {}; + } + else + config.vk_graphic_device_uuid = {}; + } + else if (config.graphic_api == GraphicAPI::kMetal) + { + if (selection != wxNOT_FOUND) + { + const auto* info = (wxMetalUUID*)m_graphic_device->GetClientObject(selection); + if (info) + config.mtl_graphic_device_uuid = info->GetDeviceInfo().uuid; + else + config.mtl_graphic_device_uuid = {}; + } + else + config.mtl_graphic_device_uuid = {}; } - else - config.graphic_device_uuid = {}; - + config.vsync = m_vsync->GetSelection(); config.gx2drawdone_sync = 
m_gx2drawdone_sync->IsChecked(); config.async_compile = m_async_compile->IsChecked(); - + config.upscale_filter = m_upscale_filter->GetSelection(); config.downscale_filter = m_downscale_filter->GetSelection(); config.fullscreen_scaling = m_fullscreen_scaling->GetSelection(); - + config.overlay.position = (ScreenPosition)m_overlay_position->GetSelection(); wxASSERT((int)config.overlay.position <= (int)ScreenPosition::kBottomRight); config.overlay.text_color = m_overlay_font_color->GetColour().GetRGBA(); config.overlay.text_scale = m_overlay_scale->GetSelection() * 25 + 50; @@ -1071,6 +1132,8 @@ void GeneralSettings2::StoreConfig() // debug config.crash_dump = (CrashDump)m_crash_dump->GetSelection(); config.gdb_port = m_gdb_port->GetValue(); + config.gpu_capture_dir = m_gpu_capture_dir->GetValue().utf8_string(); + config.framebuffer_fetch = m_framebuffer_fetch->IsChecked(); g_config.Save(); } @@ -1102,7 +1165,7 @@ void GeneralSettings2::ValidateConfig() void GeneralSettings2::DisableSettings(bool game_launched) { - + } void GeneralSettings2::OnAudioLatencyChanged(wxCommandEvent& event) @@ -1113,7 +1176,7 @@ void GeneralSettings2::OnAudioLatencyChanged(wxCommandEvent& event) void GeneralSettings2::OnVolumeChanged(wxCommandEvent& event) { - + if(event.GetEventObject() == m_input_volume) { std::shared_lock lock(g_audioInputMutex); @@ -1137,7 +1200,7 @@ void GeneralSettings2::OnVolumeChanged(wxCommandEvent& event) g_tvAudio->SetVolume(event.GetInt()); } } - + event.Skip(); } @@ -1150,7 +1213,7 @@ void GeneralSettings2::OnInputVolumeChanged(wxCommandEvent& event) g_padAudio->SetInputVolume(event.GetInt()); g_padVolume = event.GetInt(); } - + event.Skip(); } @@ -1228,7 +1291,7 @@ void GeneralSettings2::UpdateAudioDeviceList() // todo reset global instance of audio device } -void GeneralSettings2::ResetAccountInformation() +void GeneralSettings2::ResetAccountInformation() { m_account_grid->SetSplitterPosition(100); m_active_account->SetSelection(0); @@ -1256,7 +1319,7 @@ 
void GeneralSettings2::OnAccountCreate(wxCommandEvent& event) Account account(dialog.GetPersistentId(), dialog.GetMiiName().ToStdWstring()); account.Save(); Account::RefreshAccounts(); - + const int index = m_active_account->Append(account.ToString(), new wxAccountData(account)); // update ui @@ -1265,7 +1328,7 @@ void GeneralSettings2::OnAccountCreate(wxCommandEvent& event) m_create_account->Enable(m_active_account->GetCount() < 0xC); m_delete_account->Enable(m_active_account->GetCount() > 1); - + // send main window event wxASSERT(GetParent()); wxCommandEvent refresh_event(wxEVT_ACCOUNTLIST_REFRESH); @@ -1295,7 +1358,7 @@ void GeneralSettings2::OnAccountDelete(wxCommandEvent& event) return; // todo: ask if saves should be deleted too? - + const fs::path path = account.GetFileName(); try { @@ -1313,7 +1376,7 @@ void GeneralSettings2::OnAccountDelete(wxCommandEvent& event) SystemException sys(ex); cemuLog_log(LogType::Force, sys.what()); } - + } void GeneralSettings2::OnAccountSettingsChanged(wxPropertyGridEvent& event) @@ -1368,7 +1431,7 @@ void GeneralSettings2::OnAccountSettingsChanged(wxPropertyGridEvent& event) else if (property->GetName() == kPropertyEmail) { account.SetEmail(value.As().ToStdString()); - + } else if (property->GetName() == kPropertyCountry) { @@ -1376,7 +1439,7 @@ void GeneralSettings2::OnAccountSettingsChanged(wxPropertyGridEvent& event) } else cemu_assert_debug(false); - + account.Save(); Account::RefreshAccounts(); // refresh internal account list UpdateAccountInformation(); // refresh on invalid values @@ -1416,7 +1479,7 @@ void GeneralSettings2::UpdateAccountInformation() gender_property->SetChoiceSelection(std::min(gender_property->GetChoices().GetCount() - 1, (uint32)account.GetGender())); m_account_grid->GetProperty(kPropertyEmail)->SetValueFromString(std::string{ account.GetEmail() }); - + auto* country_property = dynamic_cast(m_account_grid->GetProperty(kPropertyCountry)); wxASSERT(country_property); int index = 
(country_property)->GetIndexForValue(account.GetCountry()); @@ -1500,9 +1563,9 @@ void GeneralSettings2::HandleGraphicsApiSelection() int selection = m_vsync->GetSelection(); if(selection == wxNOT_FOUND) selection = GetConfig().vsync; - + m_vsync->Clear(); - if(m_graphic_api->GetSelection() == 0) + if (m_graphic_api->GetSelection() == 0) { // OpenGL m_vsync->AppendString(_("Off")); @@ -1518,7 +1581,7 @@ void GeneralSettings2::HandleGraphicsApiSelection() m_gx2drawdone_sync->Enable(); m_async_compile->Disable(); } - else + else if (m_graphic_api->GetSelection() == 1) { // Vulkan m_gx2drawdone_sync->Disable(); @@ -1532,7 +1595,7 @@ void GeneralSettings2::HandleGraphicsApiSelection() #endif m_vsync->Select(selection); - + m_graphic_device->Enable(); auto devices = VulkanRenderer::GetDevices(); m_graphic_device->Clear(); @@ -1547,7 +1610,7 @@ void GeneralSettings2::HandleGraphicsApiSelection() const auto& config = GetConfig(); for(size_t i = 0; i < devices.size(); ++i) { - if(config.graphic_device_uuid == devices[i].uuid) + if(config.vk_graphic_device_uuid == devices[i].uuid) { m_graphic_device->SetSelection(i); break; @@ -1555,6 +1618,43 @@ void GeneralSettings2::HandleGraphicsApiSelection() } } } + else + { + // Metal + m_gx2drawdone_sync->Disable(); + m_async_compile->Enable(); + + // TODO: vsync options + m_vsync->AppendString(_("Off")); + m_vsync->AppendString(_("Double buffering")); + m_vsync->AppendString(_("Triple buffering")); + + m_vsync->Select(selection); + + m_graphic_device->Enable(); + auto devices = MetalRenderer::GetDevices(); + m_graphic_device->Clear(); +#if ENABLE_METAL + if(!devices.empty()) + { + for (const auto& device : devices) + { + m_graphic_device->Append(device.name, new wxMetalUUID(device)); + } + m_graphic_device->SetSelection(0); + + const auto& config = GetConfig(); + for (size_t i = 0; i < devices.size(); ++i) + { + if (config.mtl_graphic_device_uuid == devices[i].uuid) + { + m_graphic_device->SetSelection(i); + break; + } + } + } 
+#endif + } } void GeneralSettings2::ApplyConfig() @@ -1658,7 +1758,7 @@ void GeneralSettings2::ApplyConfig() m_pad_channels->SetSelection(0); //m_input_channels->SetSelection(config.pad_channels); m_input_channels->SetSelection(0); - + SendSliderEvent(m_tv_volume, config.tv_volume); if (!config.tv_device.empty() && m_tv_device->HasClientObjectData()) @@ -1675,7 +1775,7 @@ void GeneralSettings2::ApplyConfig() } else m_tv_device->SetSelection(0); - + SendSliderEvent(m_pad_volume, config.pad_volume); if (!config.pad_device.empty() && m_pad_device->HasClientObjectData()) { @@ -1728,6 +1828,8 @@ void GeneralSettings2::ApplyConfig() // debug m_crash_dump->SetSelection((int)config.crash_dump.GetValue()); m_gdb_port->SetValue(config.gdb_port.GetValue()); + m_gpu_capture_dir->SetValue(wxHelper::FromUtf8(config.gpu_capture_dir.GetValue())); + m_framebuffer_fetch->SetValue(config.framebuffer_fetch); } void GeneralSettings2::OnAudioAPISelected(wxCommandEvent& event) @@ -1795,7 +1897,7 @@ void GeneralSettings2::UpdateAudioDevice() } } } - + // pad audio device { const auto selection = m_pad_device->GetSelection(); @@ -1885,14 +1987,14 @@ void GeneralSettings2::OnAudioChannelsSelected(wxCommandEvent& event) { if (config.tv_channels == (AudioChannels)obj->GetSelection()) return; - + config.tv_channels = (AudioChannels)obj->GetSelection(); } else if (obj == m_pad_channels) { if (config.pad_channels == (AudioChannels)obj->GetSelection()) return; - + config.pad_channels = (AudioChannels)obj->GetSelection(); } else @@ -2035,23 +2137,23 @@ void GeneralSettings2::OnShowOnlineValidator(wxCommandEvent& event) const auto selection = m_active_account->GetSelection(); if (selection == wxNOT_FOUND) return; - + const auto* obj = dynamic_cast(m_active_account->GetClientObject(selection)); wxASSERT(obj); const auto& account = obj->GetAccount(); - + const auto validator = account.ValidateOnlineFiles(); if (validator) // everything valid? 
shouldn't happen return; - + wxString err; err << _("The following error(s) have been found:") << '\n'; - + if (validator.otp == OnlineValidator::FileState::Missing) err << _("otp.bin missing in Cemu directory") << '\n'; else if(validator.otp == OnlineValidator::FileState::Corrupted) err << _("otp.bin is invalid") << '\n'; - + if (validator.seeprom == OnlineValidator::FileState::Missing) err << _("seeprom.bin missing in Cemu directory") << '\n'; else if(validator.seeprom == OnlineValidator::FileState::Corrupted) diff --git a/src/gui/GeneralSettings2.h b/src/gui/GeneralSettings2.h index 7fbfecc10..58459e958 100644 --- a/src/gui/GeneralSettings2.h +++ b/src/gui/GeneralSettings2.h @@ -28,7 +28,7 @@ class GeneralSettings2 : public wxDialog bool m_has_account_change = false; // keep track of dirty state of accounts - + wxPanel* AddGeneralPage(wxNotebook* notebook); wxPanel* AddGraphicsPage(wxNotebook* notebook); wxPanel* AddAudioPage(wxNotebook* notebook); @@ -79,6 +79,8 @@ class GeneralSettings2 : public wxDialog // Debug wxChoice* m_crash_dump; wxSpinCtrl* m_gdb_port; + wxTextCtrl* m_gpu_capture_dir; + wxCheckBox* m_framebuffer_fetch; void OnAccountCreate(wxCommandEvent& event); void OnAccountDelete(wxCommandEvent& event); @@ -107,11 +109,10 @@ class GeneralSettings2 : public wxDialog void UpdateAudioDevice(); // refreshes audio device list for dropdown void UpdateAudioDeviceList(); - + void ResetAccountInformation(); void UpdateAccountInformation(); void UpdateOnlineAccounts(); void HandleGraphicsApiSelection(); void ApplyConfig(); }; - diff --git a/src/gui/LoggingWindow.cpp b/src/gui/LoggingWindow.cpp index 4026113e7..0d25e2796 100644 --- a/src/gui/LoggingWindow.cpp +++ b/src/gui/LoggingWindow.cpp @@ -21,7 +21,7 @@ LoggingWindow::LoggingWindow(wxFrame* parent) filter_row->Add(new wxStaticText( this, wxID_ANY, _("Filter")), 0, wxALIGN_CENTER_VERTICAL|wxALL, 5 ); - wxString choices[] = {"Unsupported APIs calls", "Coreinit Logging", "Coreinit File-Access", "Coreinit 
Thread-Synchronization", "Coreinit Memory", "Coreinit MP", "Coreinit Thread", "nn::nfp", "GX2", "Audio", "Input", "Socket", "Save", "H264", "Graphic pack patches", "Texture cache", "Texture readback", "OpenGL debug output", "Vulkan validation layer"}; + wxString choices[] = {"Unsupported APIs calls", "Coreinit Logging", "Coreinit File-Access", "Coreinit Thread-Synchronization", "Coreinit Memory", "Coreinit MP", "Coreinit Thread", "nn::nfp", "GX2", "Audio", "Input", "Socket", "Save", "H264", "Graphic pack patches", "Texture cache", "Texture readback", "OpenGL debug output", "Vulkan validation layer", "Metal debug output"}; m_filter = new wxComboBox( this, wxID_ANY, wxEmptyString, wxDefaultPosition, wxDefaultSize, std::size(choices), choices, 0 ); m_filter->Bind(wxEVT_COMBOBOX, &LoggingWindow::OnFilterChange, this); m_filter->Bind(wxEVT_TEXT, &LoggingWindow::OnFilterChange, this); @@ -83,7 +83,7 @@ void LoggingWindow::Log(std::string_view filter, std::wstring_view message) void LoggingWindow::OnLogMessage(wxLogEvent& event) { - m_log_list->PushEntry(event.GetFilter(), event.GetMessage()); + m_log_list->PushEntry(event.GetFilter(), event.GetMessage()); } void LoggingWindow::OnFilterChange(wxCommandEvent& event) @@ -97,4 +97,3 @@ void LoggingWindow::OnFilterMessageChange(wxCommandEvent& event) m_log_list->SetFilterMessage(m_filter_message->GetValue()); event.Skip(); } - diff --git a/src/gui/MainWindow.cpp b/src/gui/MainWindow.cpp index 4801706a9..1895bb886 100644 --- a/src/gui/MainWindow.cpp +++ b/src/gui/MainWindow.cpp @@ -1,3 +1,5 @@ +#include "Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h" +#include "Cafe/HW/Latte/Renderer/Renderer.h" #include "gui/wxgui.h" #include "gui/MainWindow.h" #include "gui/guiWrapper.h" @@ -12,6 +14,7 @@ #include "audio/audioDebuggerWindow.h" #include "gui/canvas/OpenGLCanvas.h" #include "gui/canvas/VulkanCanvas.h" +#include "gui/canvas/MetalCanvas.h" #include "Cafe/OS/libs/nfc/nfc.h" #include "Cafe/OS/libs/swkbd/swkbd.h" #include 
"gui/debugger/DebuggerWindow2.h" @@ -93,7 +96,7 @@ enum // options -> account MAINFRAME_MENU_ID_OPTIONS_ACCOUNT_1 = 20350, MAINFRAME_MENU_ID_OPTIONS_ACCOUNT_12 = 20350 + 11, - + // options -> system language MAINFRAME_MENU_ID_OPTIONS_LANGUAGE_JAPANESE = 20500, MAINFRAME_MENU_ID_OPTIONS_LANGUAGE_ENGLISH, @@ -136,6 +139,7 @@ enum MAINFRAME_MENU_ID_DEBUG_VIEW_TEXTURE_RELATIONS, MAINFRAME_MENU_ID_DEBUG_AUDIO_AUX_ONLY, MAINFRAME_MENU_ID_DEBUG_VK_ACCURATE_BARRIERS, + MAINFRAME_MENU_ID_DEBUG_GPU_CAPTURE, // debug->logging MAINFRAME_MENU_ID_DEBUG_LOGGING0 = 21500, @@ -211,6 +215,7 @@ EVT_MENU(MAINFRAME_MENU_ID_DEBUG_DUMP_CURL_REQUESTS, MainWindow::OnDebugSetting) EVT_MENU(MAINFRAME_MENU_ID_DEBUG_RENDER_UPSIDE_DOWN, MainWindow::OnDebugSetting) EVT_MENU(MAINFRAME_MENU_ID_DEBUG_AUDIO_AUX_ONLY, MainWindow::OnDebugSetting) EVT_MENU(MAINFRAME_MENU_ID_DEBUG_VK_ACCURATE_BARRIERS, MainWindow::OnDebugSetting) +EVT_MENU(MAINFRAME_MENU_ID_DEBUG_GPU_CAPTURE, MainWindow::OnDebugSetting) EVT_MENU(MAINFRAME_MENU_ID_DEBUG_DUMP_RAM, MainWindow::OnDebugSetting) EVT_MENU(MAINFRAME_MENU_ID_DEBUG_DUMP_FST, MainWindow::OnDebugSetting) // debug -> View ... 
@@ -243,7 +248,7 @@ class wxGameDropTarget : public wxFileDropTarget { if(!m_window->IsGameLaunched() && filenames.GetCount() == 1) return m_window->FileLoad(_utf8ToPath(filenames[0].utf8_string()), wxLaunchGameEvent::INITIATED_BY::DRAG_AND_DROP); - + return false; } @@ -455,7 +460,7 @@ bool MainWindow::InstallUpdate(const fs::path& metaFilePath) { throw std::runtime_error(frame.GetExceptionMessage()); } - } + } } catch(const AbortException&) { @@ -639,13 +644,13 @@ void MainWindow::OnFileMenu(wxCommandEvent& event) _("Wii U executable (*.rpx, *.elf)"), _("All files (*.*)") ); - + wxFileDialog openFileDialog(this, _("Open file to launch"), wxEmptyString, wxEmptyString, wildcard, wxFD_OPEN | wxFD_FILE_MUST_EXIST); if (openFileDialog.ShowModal() == wxID_CANCEL || openFileDialog.GetPath().IsEmpty()) return; - const wxString wxStrFilePath = openFileDialog.GetPath(); + const wxString wxStrFilePath = openFileDialog.GetPath(); FileLoad(_utf8ToPath(wxStrFilePath.utf8_string()), wxLaunchGameEvent::INITIATED_BY::MENU); } else if (menuId >= MAINFRAME_MENU_ID_FILE_RECENT_0 && menuId <= MAINFRAME_MENU_ID_FILE_RECENT_LAST) @@ -784,7 +789,7 @@ void MainWindow::TogglePadView() { if (m_padView) return; - + m_padView = new PadViewFrame(this); m_padView->Bind(wxEVT_CLOSE_WINDOW, &MainWindow::OnPadClose, this); @@ -992,7 +997,7 @@ void MainWindow::OnConsoleLanguage(wxCommandEvent& event) // GetConfig().cpu_mode = CPUMode::TriplecoreRecompiler; // else // cemu_assert_debug(false); -// +// // g_config.Save(); //} @@ -1006,6 +1011,14 @@ void MainWindow::OnDebugSetting(wxCommandEvent& event) if(!GetConfig().vk_accurate_barriers) wxMessageBox(_("Warning: Disabling the accurate barriers option will lead to flickering graphics but may improve performance. 
It is highly recommended to leave it turned on."), _("Accurate barriers are off"), wxOK); } + else if (event.GetId() == MAINFRAME_MENU_ID_DEBUG_GPU_CAPTURE) + { + cemu_assert_debug(g_renderer->GetType() == RendererAPI::Metal); + +#if ENABLE_METAL + static_cast(g_renderer.get())->CaptureFrame(); +#endif + } else if (event.GetId() == MAINFRAME_MENU_ID_DEBUG_AUDIO_AUX_ONLY) ActiveSettings::EnableAudioOnlyAux(event.IsChecked()); else if (event.GetId() == MAINFRAME_MENU_ID_DEBUG_DUMP_RAM) @@ -1056,7 +1069,7 @@ void MainWindow::OnDebugSetting(wxCommandEvent& event) ActiveSettings::SetTimerShiftFactor(6); else cemu_assert_debug(false); - + g_config.Save(); } @@ -1130,7 +1143,7 @@ void MainWindow::OnLoggingWindow(wxCommandEvent& event) return; m_logging_window = new LoggingWindow(this); - m_logging_window->Bind(wxEVT_CLOSE_WINDOW, + m_logging_window->Bind(wxEVT_CLOSE_WINDOW, [this](wxCloseEvent& event) { m_logging_window = nullptr; event.Skip(); @@ -1305,7 +1318,7 @@ void MainWindow::SaveSettings() { auto lock = g_config.Lock(); auto& config = GetConfig(); - + if (config.window_position != Vector2i{ -1,-1 }) { config.window_position.x = m_restored_position.x; @@ -1342,7 +1355,7 @@ void MainWindow::SaveSettings() if(m_game_list) m_game_list->SaveConfig(); - + g_config.Save(); } @@ -1372,14 +1385,14 @@ void MainWindow::OnMouseMove(wxMouseEvent& event) void MainWindow::OnMouseLeft(wxMouseEvent& event) { auto& instance = InputManager::instance(); - + std::scoped_lock lock(instance.m_main_mouse.m_mutex); instance.m_main_mouse.left_down = event.ButtonDown(wxMOUSE_BTN_LEFT); auto physPos = ToPhys(event.GetPosition()); instance.m_main_mouse.position = { physPos.x, physPos.y }; if (event.ButtonDown(wxMOUSE_BTN_LEFT)) instance.m_main_mouse.left_down_toggle = true; - + event.Skip(); } @@ -1393,7 +1406,7 @@ void MainWindow::OnMouseRight(wxMouseEvent& event) instance.m_main_mouse.position = { physPos.x, physPos.y }; if(event.ButtonDown(wxMOUSE_BTN_RIGHT)) 
instance.m_main_mouse.right_down_toggle = true; - + event.Skip(); } @@ -1441,7 +1454,7 @@ void MainWindow::OnKeyUp(wxKeyEvent& event) void MainWindow::OnKeyDown(wxKeyEvent& event) { - if ((event.AltDown() && event.GetKeyCode() == WXK_F4) || + if ((event.AltDown() && event.GetKeyCode() == WXK_F4) || (event.CmdDown() && event.GetKeyCode() == 'Q')) { Close(true); @@ -1456,7 +1469,7 @@ void MainWindow::OnChar(wxKeyEvent& event) { if (swkbd_hasKeyboardInputHook()) swkbd_keyInput(event.GetUnicodeKey()); - + // event.Skip(); } @@ -1481,7 +1494,7 @@ void MainWindow::OnToolsInput(wxCommandEvent& event) case MAINFRAME_MENU_ID_TOOLS_DOWNLOAD_MANAGER: { const auto default_tab = id == MAINFRAME_MENU_ID_TOOLS_TITLE_MANAGER ? TitleManagerPage::TitleManager : TitleManagerPage::DownloadManager; - + if (m_title_manager) m_title_manager->SetFocusAndTab(default_tab); else @@ -1531,7 +1544,7 @@ void MainWindow::OnGesturePan(wxPanGestureEvent& event) instance.m_main_touch.left_down = event.IsGestureStart() || !event.IsGestureEnd(); if (event.IsGestureStart() || !event.IsGestureEnd()) instance.m_main_touch.left_down_toggle = true; - + event.Skip(); } @@ -1565,8 +1578,12 @@ void MainWindow::CreateCanvas() // create canvas if (ActiveSettings::GetGraphicsAPI() == kVulkan) m_render_canvas = new VulkanCanvas(m_game_panel, wxSize(1280, 720), true); - else + else if (ActiveSettings::GetGraphicsAPI() == kOpenGL) m_render_canvas = GLCanvas_Create(m_game_panel, wxSize(1280, 720), true); +#if ENABLE_METAL + else + m_render_canvas = new MetalCanvas(m_game_panel, wxSize(1280, 720), true); +#endif // mouse events m_render_canvas->Bind(wxEVT_MOTION, &MainWindow::OnMouseMove, this); @@ -1746,10 +1763,10 @@ void MainWindow::UpdateNFCMenu() const auto& entry = config.recent_nfc_files[i]; if (entry.empty()) continue; - + if (!fs::exists(_utf8ToPath(entry))) continue; - + if (recentFileIndex == 0) m_nfcMenuSeparator0 = m_nfcMenu->AppendSeparator(); @@ -1800,7 +1817,7 @@ void 
MainWindow::OnTimer(wxTimerEvent& event) { ShowCursor(false); } - + } #define BUILD_DATE __DATE__ " " __TIME__ @@ -2059,9 +2076,9 @@ void MainWindow::RecreateMenu() m_menuBar->Destroy(); m_menuBar = nullptr; } - + auto& config = GetConfig(); - + m_menuBar = new wxMenuBar(); // file submenu m_fileMenu = new wxMenu(); @@ -2113,7 +2130,7 @@ void MainWindow::RecreateMenu() item->Check(account_id == account.GetPersistentId()); if (m_game_launched || LaunchSettings::GetPersistentId().has_value()) item->Enable(false); - + ++index; } @@ -2143,8 +2160,8 @@ void MainWindow::RecreateMenu() // options submenu wxMenu* optionsMenu = new wxMenu(); m_fullscreenMenuItem = optionsMenu->AppendCheckItem(MAINFRAME_MENU_ID_OPTIONS_FULLSCREEN, _("&Fullscreen"), wxEmptyString); - m_fullscreenMenuItem->Check(ActiveSettings::FullscreenEnabled()); - + m_fullscreenMenuItem->Check(ActiveSettings::FullscreenEnabled()); + optionsMenu->Append(MAINFRAME_MENU_ID_OPTIONS_GRAPHIC_PACKS2, _("&Graphic packs")); m_padViewMenuItem = optionsMenu->AppendCheckItem(MAINFRAME_MENU_ID_OPTIONS_SECOND_WINDOW_PADVIEW, _("&Separate GamePad view"), wxEmptyString); m_padViewMenuItem->Check(GetConfig().pad_open); @@ -2238,7 +2255,7 @@ void MainWindow::RecreateMenu() debugMenu->AppendSubMenu(debugLoggingMenu, _("&Logging")); debugMenu->AppendSubMenu(debugDumpMenu, _("&Dump")); debugMenu->AppendSeparator(); - + auto upsidedownItem = debugMenu->AppendCheckItem(MAINFRAME_MENU_ID_DEBUG_RENDER_UPSIDE_DOWN, _("&Render upside-down"), wxEmptyString); upsidedownItem->Check(ActiveSettings::RenderUpsideDownEnabled()); if(LaunchSettings::RenderUpsideDownEnabled().has_value()) @@ -2247,6 +2264,9 @@ void MainWindow::RecreateMenu() auto accurateBarriers = debugMenu->AppendCheckItem(MAINFRAME_MENU_ID_DEBUG_VK_ACCURATE_BARRIERS, _("&Accurate barriers (Vulkan)"), wxEmptyString); accurateBarriers->Check(GetConfig().vk_accurate_barriers); + auto gpuCapture = debugMenu->Append(MAINFRAME_MENU_ID_DEBUG_GPU_CAPTURE, _("&GPU capture 
(Metal)")); + gpuCapture->Enable(m_game_launched && g_renderer->GetType() == RendererAPI::Metal); + debugMenu->AppendSeparator(); #ifdef CEMU_DEBUG_ASSERT diff --git a/src/gui/PadViewFrame.cpp b/src/gui/PadViewFrame.cpp index e7cc5c185..94319299e 100644 --- a/src/gui/PadViewFrame.cpp +++ b/src/gui/PadViewFrame.cpp @@ -8,6 +8,7 @@ #include "Cafe/OS/libs/swkbd/swkbd.h" #include "gui/canvas/OpenGLCanvas.h" #include "gui/canvas/VulkanCanvas.h" +#include "gui/canvas/MetalCanvas.h" #include "config/CemuConfig.h" #include "gui/MainWindow.h" #include "gui/helpers/wxHelpers.h" @@ -74,8 +75,12 @@ void PadViewFrame::InitializeRenderCanvas() { if (ActiveSettings::GetGraphicsAPI() == kVulkan) m_render_canvas = new VulkanCanvas(this, wxSize(854, 480), false); - else + else if (ActiveSettings::GetGraphicsAPI() == kOpenGL) m_render_canvas = GLCanvas_Create(this, wxSize(854, 480), false); +#if ENABLE_METAL + else + m_render_canvas = new MetalCanvas(this, wxSize(854, 480), false); +#endif sizer->Add(m_render_canvas, 1, wxEXPAND, 0, nullptr); } SetSizer(sizer); @@ -173,7 +178,7 @@ void PadViewFrame::OnChar(wxKeyEvent& event) { if (swkbd_hasKeyboardInputHook()) swkbd_keyInput(event.GetUnicodeKey()); - + event.Skip(); } @@ -198,7 +203,7 @@ void PadViewFrame::OnMouseLeft(wxMouseEvent& event) instance.m_pad_mouse.position = { physPos.x, physPos.y }; if (event.ButtonDown(wxMOUSE_BTN_LEFT)) instance.m_pad_mouse.left_down_toggle = true; - + } void PadViewFrame::OnMouseRight(wxMouseEvent& event) diff --git a/src/gui/canvas/MetalCanvas.cpp b/src/gui/canvas/MetalCanvas.cpp new file mode 100644 index 000000000..a9d1cb9dc --- /dev/null +++ b/src/gui/canvas/MetalCanvas.cpp @@ -0,0 +1,62 @@ +#include "gui/canvas/MetalCanvas.h" +#include "Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h" +#include "gui/guiWrapper.h" + +#include <wx/msgdlg.h> +#include "gui/helpers/wxHelpers.h" + +MetalCanvas::MetalCanvas(wxWindow* parent, const wxSize& size, bool is_main_window) + : IRenderCanvas(is_main_window), wxWindow(parent, wxID_ANY, wxDefaultPosition, 
size, wxNO_FULL_REPAINT_ON_RESIZE | wxWANTS_CHARS) +{ + Bind(wxEVT_PAINT, &MetalCanvas::OnPaint, this); + Bind(wxEVT_SIZE, &MetalCanvas::OnResize, this); + + WindowHandleInfo& canvas = is_main_window ? gui_getWindowInfo().canvas_main : gui_getWindowInfo().canvas_pad; + gui_initHandleContextFromWxWidgetsWindow(canvas, this); + + try + { + if (is_main_window) + g_renderer = std::make_unique<MetalRenderer>(); // only the main window creates the global renderer; a pad canvas attaches its layer to the existing instance + + auto metal_renderer = MetalRenderer::GetInstance(); + metal_renderer->InitializeLayer({size.x, size.y}, is_main_window); + } + catch(const std::exception& ex) + { + cemuLog_log(LogType::Force, "Error when initializing Metal renderer: {}", ex.what()); + auto msg = formatWxString(_("Error when initializing Metal renderer:\n{}"), ex.what()); + wxMessageDialog dialog(this, msg, _("Error"), wxOK | wxCENTRE | wxICON_ERROR); + dialog.ShowModal(); + exit(0); + } + + wxWindow::EnableTouchEvents(wxTOUCH_PAN_GESTURES); +} + +MetalCanvas::~MetalCanvas() +{ + Unbind(wxEVT_PAINT, &MetalCanvas::OnPaint, this); + Unbind(wxEVT_SIZE, &MetalCanvas::OnResize, this); + + MetalRenderer* mtlr = (MetalRenderer*)g_renderer.get(); + if (mtlr) + mtlr->ShutdownLayer(m_is_main_window); +} + +void MetalCanvas::OnPaint(wxPaintEvent& event) +{ +} + +void MetalCanvas::OnResize(wxSizeEvent& event) +{ + const wxSize size = GetSize(); + if (size.GetWidth() == 0 || size.GetHeight() == 0) + return; + + const wxRect refreshRect(size); + RefreshRect(refreshRect, false); + + auto metal_renderer = MetalRenderer::GetInstance(); + metal_renderer->ResizeLayer({size.x, size.y}, m_is_main_window); +} diff --git a/src/gui/canvas/MetalCanvas.h b/src/gui/canvas/MetalCanvas.h new file mode 100644 index 000000000..4dc4d49f9 --- /dev/null +++ b/src/gui/canvas/MetalCanvas.h @@ -0,0 +1,19 @@ +#pragma once + +#include "gui/canvas/IRenderCanvas.h" + +#include + +#include + +class MetalCanvas : public IRenderCanvas, public wxWindow +{ +public: + MetalCanvas(wxWindow* parent, const wxSize& size, bool is_main_window); + 
~MetalCanvas(); + +private: + + void OnPaint(wxPaintEvent& event); + void OnResize(wxSizeEvent& event); +}; diff --git a/src/gui/components/wxGameList.cpp b/src/gui/components/wxGameList.cpp index 6cbb58594..fb03843a2 100644 --- a/src/gui/components/wxGameList.cpp +++ b/src/gui/components/wxGameList.cpp @@ -69,8 +69,11 @@ std::list _getCachesPaths(const TitleId& titleId) ActiveSettings::GetCachePath(L"shaderCache/driver/vk/{:016x}.bin", titleId), ActiveSettings::GetCachePath(L"shaderCache/precompiled/{:016x}_spirv.bin", titleId), ActiveSettings::GetCachePath(L"shaderCache/precompiled/{:016x}_gl.bin", titleId), + ActiveSettings::GetCachePath(L"shaderCache/precompiled/{:016x}_air.bin", titleId), ActiveSettings::GetCachePath(L"shaderCache/transferable/{:016x}_shaders.bin", titleId), - ActiveSettings::GetCachePath(L"shaderCache/transferable/{:016x}_vkpipeline.bin", titleId)}; + ActiveSettings::GetCachePath(L"shaderCache/transferable/{:016x}_mtlshaders.bin", titleId), + ActiveSettings::GetCachePath(L"shaderCache/transferable/{:016x}_vkpipeline.bin", titleId), + ActiveSettings::GetCachePath(L"shaderCache/transferable/{:016x}_mtlpipeline.bin", titleId)}; cachePaths.remove_if( [](const fs::path& cachePath) @@ -200,13 +203,13 @@ void wxGameList::OnGameListSize(wxSizeEvent &event) for(int i = GetColumnCount() - 1; i > 0; i--) { #ifdef wxHAS_LISTCTRL_COLUMN_ORDER - if(GetColumnWidth(GetColumnIndexFromOrder(i)) > 0) + if(GetColumnWidth(GetColumnIndexFromOrder(i)) > 0) { last_col_index = GetColumnIndexFromOrder(i); break; } #else - if(GetColumnWidth(i) > 0) + if(GetColumnWidth(i) > 0) { last_col_index = i; break; @@ -938,13 +941,13 @@ void wxGameList::OnColumnBeginResize(wxListEvent& event) for(int i = GetColumnCount() - 1; i > 0; i--) { #ifdef wxHAS_LISTCTRL_COLUMN_ORDER - if(GetColumnWidth(GetColumnIndexFromOrder(i)) > 0) + if(GetColumnWidth(GetColumnIndexFromOrder(i)) > 0) { last_col_index = GetColumnIndexFromOrder(i); break; } #else - if(GetColumnWidth(i) > 0) + 
if(GetColumnWidth(i) > 0) { last_col_index = i; break; @@ -1076,7 +1079,7 @@ void wxGameList::OnGameEntryUpdatedByTitleId(wxTitleIdEvent& event) wxString minutesText = formatWxString(wxPLURAL("{} minute", "{} minutes", minutes), minutes); SetItem(index, ColumnGameTime, hoursText + " " + minutesText); } - + // last played if (playTimeStat.last_played.year != 0) { @@ -1290,7 +1293,7 @@ bool wxGameList::QueryIconForTitle(TitleId titleId, int& icon, int& iconSmall) return true; } -void wxGameList::DeleteCachedStrings() +void wxGameList::DeleteCachedStrings() { m_name_cache.clear(); } @@ -1448,7 +1451,7 @@ void wxGameList::CreateShortcut(GameInfo2& gameInfo) if (SUCCEEDED(hres)) { hres = shellLinkFile->Save(outputPath.wc_str(), TRUE); - shellLinkFile->Release(); + shellLinkFile->Release(); } shellLink->Release(); } @@ -1457,4 +1460,4 @@ void wxGameList::CreateShortcut(GameInfo2& gameInfo) wxMessageBox(errorMsg, _("Error"), wxOK | wxCENTRE | wxICON_ERROR); } } -#endif \ No newline at end of file +#endif diff --git a/src/gui/guiWrapper.cpp b/src/gui/guiWrapper.cpp index d887e89a1..8f004eddb 100644 --- a/src/gui/guiWrapper.cpp +++ b/src/gui/guiWrapper.cpp @@ -82,11 +82,14 @@ void gui_updateWindowTitles(bool isIdle, bool isLoading, double fps) case RendererAPI::OpenGL: renderer = "[OpenGL]"; break; - case RendererAPI::Vulkan: + case RendererAPI::Vulkan: renderer = "[Vulkan]"; break; + case RendererAPI::Metal: + renderer = "[Metal]"; + break; default: ; - } + } } // get GPU vendor/mode @@ -217,7 +220,7 @@ void gui_initHandleContextFromWxWidgetsWindow(WindowHandleInfo& handleInfoOut, c cemuLog_log(LogType::Force, "Unable to get xlib display"); } } - else + else #ifdef HAS_WAYLAND if(GDK_IS_WAYLAND_WINDOW(gdkWindow)) { diff --git a/src/imgui/CMakeLists.txt b/src/imgui/CMakeLists.txt index db7686bd8..86aeb130f 100644 --- a/src/imgui/CMakeLists.txt +++ b/src/imgui/CMakeLists.txt @@ -7,6 +7,15 @@ add_library(imguiImpl imgui_extension.h ) +if (ENABLE_METAL) + 
target_sources(imguiImpl PRIVATE + imgui_impl_metal.mm + imgui_impl_metal.h + ) + + target_compile_definitions(imguiImpl PRIVATE IMGUI_IMPL_METAL_CPP) +endif () + set_property(TARGET imguiImpl PROPERTY MSVC_RUNTIME_LIBRARY "MultiThreaded$<$:Debug>") target_include_directories(imguiImpl PUBLIC "../") diff --git a/src/imgui/imgui_impl_metal.h b/src/imgui/imgui_impl_metal.h new file mode 100644 index 000000000..3aaacb9e0 --- /dev/null +++ b/src/imgui/imgui_impl_metal.h @@ -0,0 +1,64 @@ +// dear imgui: Renderer Backend for Metal +// This needs to be used along with a Platform Backend (e.g. OSX) + +// Implemented features: +// [X] Renderer: User texture binding. Use 'MTLTexture' as ImTextureID. Read the FAQ about ImTextureID! +// [X] Renderer: Large meshes support (64k+ vertices) with 16-bit indices. + +// You can use unmodified imgui_impl_* files in your project. See examples/ folder for examples of using this. +// Prefer including the entire imgui/ repository into your project (either as a copy or as a submodule), and only build the backends you need. +// If you are new to Dear ImGui, read documentation from the docs/ folder + read the top of imgui.cpp. 
+// Read online: https://github.com/ocornut/imgui/tree/master/docs + +#include "imgui.h" // IMGUI_IMPL_API + +//----------------------------------------------------------------------------- +// ObjC API +//----------------------------------------------------------------------------- + +#ifdef __OBJC__ + +@class MTLRenderPassDescriptor; +@protocol MTLDevice, MTLCommandBuffer, MTLRenderCommandEncoder; + +IMGUI_IMPL_API bool ImGui_ImplMetal_Init(id device); +IMGUI_IMPL_API void ImGui_ImplMetal_Shutdown(); +IMGUI_IMPL_API void ImGui_ImplMetal_NewFrame(MTLRenderPassDescriptor* renderPassDescriptor); +IMGUI_IMPL_API void ImGui_ImplMetal_RenderDrawData(ImDrawData* drawData, + id commandBuffer, + id commandEncoder); + +// Called by Init/NewFrame/Shutdown +IMGUI_IMPL_API bool ImGui_ImplMetal_CreateFontsTexture(id device); +IMGUI_IMPL_API void ImGui_ImplMetal_DestroyFontsTexture(); +IMGUI_IMPL_API bool ImGui_ImplMetal_CreateDeviceObjects(id device); +IMGUI_IMPL_API void ImGui_ImplMetal_DestroyDeviceObjects(); + +#endif + +//----------------------------------------------------------------------------- +// C++ API +//----------------------------------------------------------------------------- + +// Enable Metal C++ binding support with '#define IMGUI_IMPL_METAL_CPP' in your imconfig.h file +// More info about using Metal from C++: https://developer.apple.com/metal/cpp/ + +#ifdef IMGUI_IMPL_METAL_CPP +#include +#ifndef __OBJC__ + +IMGUI_IMPL_API bool ImGui_ImplMetal_Init(MTL::Device* device); +IMGUI_IMPL_API void ImGui_ImplMetal_Shutdown(); +IMGUI_IMPL_API void ImGui_ImplMetal_NewFrame(MTL::RenderPassDescriptor* renderPassDescriptor); +IMGUI_IMPL_API void ImGui_ImplMetal_RenderDrawData(ImDrawData* draw_data, + MTL::CommandBuffer* commandBuffer, + MTL::RenderCommandEncoder* commandEncoder); + +// Called by Init/NewFrame/Shutdown +IMGUI_IMPL_API bool ImGui_ImplMetal_CreateFontsTexture(MTL::Device* device); +IMGUI_IMPL_API void ImGui_ImplMetal_DestroyFontsTexture(); 
+IMGUI_IMPL_API bool ImGui_ImplMetal_CreateDeviceObjects(MTL::Device* device); +IMGUI_IMPL_API void ImGui_ImplMetal_DestroyDeviceObjects(); + +#endif +#endif diff --git a/src/imgui/imgui_impl_metal.mm b/src/imgui/imgui_impl_metal.mm new file mode 100644 index 000000000..5f0588573 --- /dev/null +++ b/src/imgui/imgui_impl_metal.mm @@ -0,0 +1,575 @@ +// dear imgui: Renderer Backend for Metal +// This needs to be used along with a Platform Backend (e.g. OSX) + +// Implemented features: +// [X] Renderer: User texture binding. Use 'MTLTexture' as ImTextureID. Read the FAQ about ImTextureID! +// [X] Renderer: Large meshes support (64k+ vertices) with 16-bit indices. + +// You can use unmodified imgui_impl_* files in your project. See examples/ folder for examples of using this. +// Prefer including the entire imgui/ repository into your project (either as a copy or as a submodule), and only build the backends you need. +// If you are new to Dear ImGui, read documentation from the docs/ folder + read the top of imgui.cpp. +// Read online: https://github.com/ocornut/imgui/tree/master/docs + +// CHANGELOG +// (minor and older changes stripped away, please see git history for details) +// 2022-08-23: Metal: Update deprecated property 'sampleCount'->'rasterSampleCount'. +// 2022-07-05: Metal: Add dispatch synchronization. +// 2022-06-30: Metal: Use __bridge for ARC based systems. +// 2022-06-01: Metal: Fixed null dereference on exit inside command buffer completion handler. +// 2022-04-27: Misc: Store backend data in a per-context struct, allowing to use this backend with multiple contexts. +// 2022-01-03: Metal: Ignore ImDrawCmd where ElemCount == 0 (very rare but can technically be manufactured by user code). +// 2021-12-30: Metal: Added Metal C++ support. Enable with '#define IMGUI_IMPL_METAL_CPP' in your imconfig.h file. +// 2021-08-24: Metal: Fixed a crash when clipping rect larger than framebuffer is submitted. 
(#4464) +// 2021-05-19: Metal: Replaced direct access to ImDrawCmd::TextureId with a call to ImDrawCmd::GetTexID(). (will become a requirement) +// 2021-02-18: Metal: Change blending equation to preserve alpha in output buffer. +// 2021-01-25: Metal: Fixed texture storage mode when building on Mac Catalyst. +// 2019-05-29: Metal: Added support for large mesh (64K+ vertices), enable ImGuiBackendFlags_RendererHasVtxOffset flag. +// 2019-04-30: Metal: Added support for special ImDrawCallback_ResetRenderState callback to reset render state. +// 2019-02-11: Metal: Projecting clipping rectangles correctly using draw_data->FramebufferScale to allow multi-viewports for retina display. +// 2018-11-30: Misc: Setting up io.BackendRendererName so it can be displayed in the About Window. +// 2018-07-05: Metal: Added new Metal backend implementation. + +#include "imgui.h" +#include "imgui_impl_metal.h" +#import +#import + +#pragma mark - Support classes + +// A wrapper around a MTLBuffer object that knows the last time it was reused +@interface MetalBuffer : NSObject +@property (nonatomic, strong) id buffer; +@property (nonatomic, assign) double lastReuseTime; +- (instancetype)initWithBuffer:(id)buffer; +@end + +// An object that encapsulates the data necessary to uniquely identify a +// render pipeline state. These are used as cache keys. +@interface FramebufferDescriptor : NSObject +@property (nonatomic, assign) unsigned long sampleCount; +@property (nonatomic, assign) MTLPixelFormat colorPixelFormat; +@property (nonatomic, assign) MTLPixelFormat depthPixelFormat; +@property (nonatomic, assign) MTLPixelFormat stencilPixelFormat; +- (instancetype)initWithRenderPassDescriptor:(MTLRenderPassDescriptor*)renderPassDescriptor; +@end + +// A singleton that stores long-lived objects that are needed by the Metal +// renderer backend. Stores the render pipeline state cache and the default +// font texture, and manages the reusable buffer cache. 
+@interface MetalContext : NSObject +@property (nonatomic, strong) id device; +@property (nonatomic, strong) id depthStencilState; +@property (nonatomic, strong) FramebufferDescriptor* framebufferDescriptor; // framebuffer descriptor for current frame; transient +@property (nonatomic, strong) NSMutableDictionary* renderPipelineStateCache; // pipeline cache; keyed on framebuffer descriptors +@property (nonatomic, strong, nullable) id fontTexture; +@property (nonatomic, strong) NSMutableArray* bufferCache; +@property (nonatomic, assign) double lastBufferCachePurge; +- (MetalBuffer*)dequeueReusableBufferOfLength:(NSUInteger)length device:(id)device; +- (id)renderPipelineStateForFramebufferDescriptor:(FramebufferDescriptor*)descriptor device:(id)device; +@end + +struct ImGui_ImplMetal_Data +{ + MetalContext* SharedMetalContext; + + ImGui_ImplMetal_Data() { memset(this, 0, sizeof(*this)); } +}; + +static ImGui_ImplMetal_Data* ImGui_ImplMetal_CreateBackendData() { return IM_NEW(ImGui_ImplMetal_Data)(); } +static ImGui_ImplMetal_Data* ImGui_ImplMetal_GetBackendData() { return ImGui::GetCurrentContext() ? 
(ImGui_ImplMetal_Data*)ImGui::GetIO().BackendRendererUserData : nullptr; } +static void ImGui_ImplMetal_DestroyBackendData(){ if (ImGui_ImplMetal_Data* bd = ImGui_ImplMetal_GetBackendData()) { IM_DELETE(bd); ImGui::GetIO().BackendRendererUserData = nullptr; } } // also clear the io slot so GetBackendData() returns nullptr after shutdown instead of a freed pointer (the command-buffer completion handler relies on that null check) + +static inline CFTimeInterval GetMachAbsoluteTimeInSeconds() { return (CFTimeInterval)(double)(clock_gettime_nsec_np(CLOCK_UPTIME_RAW) / 1e9); } + +#ifdef IMGUI_IMPL_METAL_CPP + +#pragma mark - Dear ImGui Metal C++ Backend API + +bool ImGui_ImplMetal_Init(MTL::Device* device) +{ + return ImGui_ImplMetal_Init((__bridge id)(device)); +} + +void ImGui_ImplMetal_NewFrame(MTL::RenderPassDescriptor* renderPassDescriptor) +{ + ImGui_ImplMetal_NewFrame((__bridge MTLRenderPassDescriptor*)(renderPassDescriptor)); +} + +void ImGui_ImplMetal_RenderDrawData(ImDrawData* draw_data, + MTL::CommandBuffer* commandBuffer, + MTL::RenderCommandEncoder* commandEncoder) +{ + ImGui_ImplMetal_RenderDrawData(draw_data, + (__bridge id)(commandBuffer), + (__bridge id)(commandEncoder)); + +} + +bool ImGui_ImplMetal_CreateFontsTexture(MTL::Device* device) +{ + return ImGui_ImplMetal_CreateFontsTexture((__bridge id)(device)); +} + +bool ImGui_ImplMetal_CreateDeviceObjects(MTL::Device* device) +{ + return ImGui_ImplMetal_CreateDeviceObjects((__bridge id)(device)); +} + +#endif // #ifdef IMGUI_IMPL_METAL_CPP + +#pragma mark - Dear ImGui Metal Backend API + +bool ImGui_ImplMetal_Init(id device) +{ + ImGui_ImplMetal_Data* bd = ImGui_ImplMetal_CreateBackendData(); + ImGuiIO& io = ImGui::GetIO(); + io.BackendRendererUserData = (void*)bd; + io.BackendRendererName = "imgui_impl_metal"; + io.BackendFlags |= ImGuiBackendFlags_RendererHasVtxOffset; // We can honor the ImDrawCmd::VtxOffset field, allowing for large meshes. 
+ + bd->SharedMetalContext = [[MetalContext alloc] init]; + bd->SharedMetalContext.device = device; + + return true; +} + +void ImGui_ImplMetal_Shutdown() +{ + ImGui_ImplMetal_DestroyDeviceObjects(); + ImGui_ImplMetal_DestroyBackendData(); +} + +void ImGui_ImplMetal_NewFrame(MTLRenderPassDescriptor* renderPassDescriptor) +{ + ImGui_ImplMetal_Data* bd = ImGui_ImplMetal_GetBackendData(); + IM_ASSERT(bd->SharedMetalContext != nil && "No Metal context. Did you call ImGui_ImplMetal_Init() ?"); + bd->SharedMetalContext.framebufferDescriptor = [[FramebufferDescriptor alloc] initWithRenderPassDescriptor:renderPassDescriptor]; + + if (bd->SharedMetalContext.depthStencilState == nil) + ImGui_ImplMetal_CreateDeviceObjects(bd->SharedMetalContext.device); +} + +static void ImGui_ImplMetal_SetupRenderState(ImDrawData* drawData, id commandBuffer, + id commandEncoder, id renderPipelineState, + MetalBuffer* vertexBuffer, size_t vertexBufferOffset) +{ + IM_UNUSED(commandBuffer); + ImGui_ImplMetal_Data* bd = ImGui_ImplMetal_GetBackendData(); + [commandEncoder setCullMode:MTLCullModeNone]; + [commandEncoder setDepthStencilState:bd->SharedMetalContext.depthStencilState]; + + // Setup viewport, orthographic projection matrix + // Our visible imgui space lies from draw_data->DisplayPos (top left) to + // draw_data->DisplayPos+data_data->DisplaySize (bottom right). DisplayMin is typically (0,0) for single viewport apps. 
+ MTLViewport viewport = + { + .originX = 0.0, + .originY = 0.0, + .width = (double)(drawData->DisplaySize.x * drawData->FramebufferScale.x), + .height = (double)(drawData->DisplaySize.y * drawData->FramebufferScale.y), + .znear = 0.0, + .zfar = 1.0 + }; + [commandEncoder setViewport:viewport]; + + float L = drawData->DisplayPos.x; + float R = drawData->DisplayPos.x + drawData->DisplaySize.x; + float T = drawData->DisplayPos.y; + float B = drawData->DisplayPos.y + drawData->DisplaySize.y; + float N = (float)viewport.znear; + float F = (float)viewport.zfar; + const float ortho_projection[4][4] = + { + { 2.0f/(R-L), 0.0f, 0.0f, 0.0f }, + { 0.0f, 2.0f/(T-B), 0.0f, 0.0f }, + { 0.0f, 0.0f, 1/(F-N), 0.0f }, + { (R+L)/(L-R), (T+B)/(B-T), N/(F-N), 1.0f }, + }; + [commandEncoder setVertexBytes:&ortho_projection length:sizeof(ortho_projection) atIndex:1]; + + [commandEncoder setRenderPipelineState:renderPipelineState]; + + [commandEncoder setVertexBuffer:vertexBuffer.buffer offset:0 atIndex:0]; + [commandEncoder setVertexBufferOffset:vertexBufferOffset atIndex:0]; +} + +// Metal Render function. +void ImGui_ImplMetal_RenderDrawData(ImDrawData* drawData, id commandBuffer, id commandEncoder) +{ + ImGui_ImplMetal_Data* bd = ImGui_ImplMetal_GetBackendData(); + MetalContext* ctx = bd->SharedMetalContext; + + // Avoid rendering when minimized, scale coordinates for retina displays (screen coordinates != framebuffer coordinates) + int fb_width = (int)(drawData->DisplaySize.x * drawData->FramebufferScale.x); + int fb_height = (int)(drawData->DisplaySize.y * drawData->FramebufferScale.y); + if (fb_width <= 0 || fb_height <= 0 || drawData->CmdListsCount == 0) + return; + + // Try to retrieve a render pipeline state that is compatible with the framebuffer config for this frame + // The hit rate for this cache should be very near 100%. 
+ id renderPipelineState = ctx.renderPipelineStateCache[ctx.framebufferDescriptor]; + if (renderPipelineState == nil) + { + // No luck; make a new render pipeline state + renderPipelineState = [ctx renderPipelineStateForFramebufferDescriptor:ctx.framebufferDescriptor device:commandBuffer.device]; + + // Cache render pipeline state for later reuse + ctx.renderPipelineStateCache[ctx.framebufferDescriptor] = renderPipelineState; + } + + size_t vertexBufferLength = (size_t)drawData->TotalVtxCount * sizeof(ImDrawVert); + size_t indexBufferLength = (size_t)drawData->TotalIdxCount * sizeof(ImDrawIdx); + MetalBuffer* vertexBuffer = [ctx dequeueReusableBufferOfLength:vertexBufferLength device:commandBuffer.device]; + MetalBuffer* indexBuffer = [ctx dequeueReusableBufferOfLength:indexBufferLength device:commandBuffer.device]; + + ImGui_ImplMetal_SetupRenderState(drawData, commandBuffer, commandEncoder, renderPipelineState, vertexBuffer, 0); + + // Will project scissor/clipping rectangles into framebuffer space + ImVec2 clip_off = drawData->DisplayPos; // (0,0) unless using multi-viewports + ImVec2 clip_scale = drawData->FramebufferScale; // (1,1) unless using retina display which are often (2,2) + + // Render command lists + size_t vertexBufferOffset = 0; + size_t indexBufferOffset = 0; + for (int n = 0; n < drawData->CmdListsCount; n++) + { + const ImDrawList* cmd_list = drawData->CmdLists[n]; + + memcpy((char*)vertexBuffer.buffer.contents + vertexBufferOffset, cmd_list->VtxBuffer.Data, (size_t)cmd_list->VtxBuffer.Size * sizeof(ImDrawVert)); + memcpy((char*)indexBuffer.buffer.contents + indexBufferOffset, cmd_list->IdxBuffer.Data, (size_t)cmd_list->IdxBuffer.Size * sizeof(ImDrawIdx)); + + for (int cmd_i = 0; cmd_i < cmd_list->CmdBuffer.Size; cmd_i++) + { + const ImDrawCmd* pcmd = &cmd_list->CmdBuffer[cmd_i]; + if (pcmd->UserCallback) + { + // User callback, registered via ImDrawList::AddCallback() + // (ImDrawCallback_ResetRenderState is a special callback value used by the 
user to request the renderer to reset render state.) + if (pcmd->UserCallback == ImDrawCallback_ResetRenderState) + ImGui_ImplMetal_SetupRenderState(drawData, commandBuffer, commandEncoder, renderPipelineState, vertexBuffer, vertexBufferOffset); + else + pcmd->UserCallback(cmd_list, pcmd); + } + else + { + // Project scissor/clipping rectangles into framebuffer space + ImVec2 clip_min((pcmd->ClipRect.x - clip_off.x) * clip_scale.x, (pcmd->ClipRect.y - clip_off.y) * clip_scale.y); + ImVec2 clip_max((pcmd->ClipRect.z - clip_off.x) * clip_scale.x, (pcmd->ClipRect.w - clip_off.y) * clip_scale.y); + + // Clamp to viewport as setScissorRect() won't accept values that are off bounds + if (clip_min.x < 0.0f) { clip_min.x = 0.0f; } + if (clip_min.y < 0.0f) { clip_min.y = 0.0f; } + if (clip_max.x > fb_width) { clip_max.x = (float)fb_width; } + if (clip_max.y > fb_height) { clip_max.y = (float)fb_height; } + if (clip_max.x <= clip_min.x || clip_max.y <= clip_min.y) + continue; + if (pcmd->ElemCount == 0) // drawIndexedPrimitives() validation doesn't accept this + continue; + + // Apply scissor/clipping rectangle + MTLScissorRect scissorRect = + { + .x = NSUInteger(clip_min.x), + .y = NSUInteger(clip_min.y), + .width = NSUInteger(clip_max.x - clip_min.x), + .height = NSUInteger(clip_max.y - clip_min.y) + }; + [commandEncoder setScissorRect:scissorRect]; + + // Bind texture, Draw + if (ImTextureID tex_id = pcmd->GetTexID()) + [commandEncoder setFragmentTexture:(__bridge id)(tex_id) atIndex:0]; + + [commandEncoder setVertexBufferOffset:(vertexBufferOffset + pcmd->VtxOffset * sizeof(ImDrawVert)) atIndex:0]; + [commandEncoder drawIndexedPrimitives:MTLPrimitiveTypeTriangle + indexCount:pcmd->ElemCount + indexType:sizeof(ImDrawIdx) == 2 ? 
MTLIndexTypeUInt16 : MTLIndexTypeUInt32 + indexBuffer:indexBuffer.buffer + indexBufferOffset:indexBufferOffset + pcmd->IdxOffset * sizeof(ImDrawIdx)]; + } + } + + vertexBufferOffset += (size_t)cmd_list->VtxBuffer.Size * sizeof(ImDrawVert); + indexBufferOffset += (size_t)cmd_list->IdxBuffer.Size * sizeof(ImDrawIdx); + } + + [commandBuffer addCompletedHandler:^(id) + { + dispatch_async(dispatch_get_main_queue(), ^{ + ImGui_ImplMetal_Data* bd = ImGui_ImplMetal_GetBackendData(); + if (bd != nullptr) + { + @synchronized(bd->SharedMetalContext.bufferCache) + { + [bd->SharedMetalContext.bufferCache addObject:vertexBuffer]; + [bd->SharedMetalContext.bufferCache addObject:indexBuffer]; + } + } + }); + }]; +} + +bool ImGui_ImplMetal_CreateFontsTexture(id device) +{ + ImGui_ImplMetal_Data* bd = ImGui_ImplMetal_GetBackendData(); + ImGuiIO& io = ImGui::GetIO(); + + // We are retrieving and uploading the font atlas as a 4-channels RGBA texture here. + // In theory we could call GetTexDataAsAlpha8() and upload a 1-channel texture to save on memory access bandwidth. + // However, using a shader designed for 1-channel texture would make it less obvious to use the ImTextureID facility to render users own textures. + // You can make that change in your implementation. 
+ unsigned char* pixels; + int width, height; + io.Fonts->GetTexDataAsRGBA32(&pixels, &width, &height); + MTLTextureDescriptor* textureDescriptor = [MTLTextureDescriptor texture2DDescriptorWithPixelFormat:MTLPixelFormatRGBA8Unorm + width:(NSUInteger)width + height:(NSUInteger)height + mipmapped:NO]; + textureDescriptor.usage = MTLTextureUsageShaderRead; +#if TARGET_OS_OSX || TARGET_OS_MACCATALYST + textureDescriptor.storageMode = MTLStorageModeManaged; +#else + textureDescriptor.storageMode = MTLStorageModeShared; +#endif + id texture = [device newTextureWithDescriptor:textureDescriptor]; + [texture replaceRegion:MTLRegionMake2D(0, 0, (NSUInteger)width, (NSUInteger)height) mipmapLevel:0 withBytes:pixels bytesPerRow:(NSUInteger)width * 4]; + bd->SharedMetalContext.fontTexture = texture; + io.Fonts->SetTexID((__bridge void*)bd->SharedMetalContext.fontTexture); // ImTextureID == void* + + return (bd->SharedMetalContext.fontTexture != nil); +} + +void ImGui_ImplMetal_DestroyFontsTexture() +{ + ImGui_ImplMetal_Data* bd = ImGui_ImplMetal_GetBackendData(); + ImGuiIO& io = ImGui::GetIO(); + bd->SharedMetalContext.fontTexture = nil; + io.Fonts->SetTexID(nullptr); +} + +bool ImGui_ImplMetal_CreateDeviceObjects(id device) +{ + ImGui_ImplMetal_Data* bd = ImGui_ImplMetal_GetBackendData(); + MTLDepthStencilDescriptor* depthStencilDescriptor = [[MTLDepthStencilDescriptor alloc] init]; + depthStencilDescriptor.depthWriteEnabled = NO; + depthStencilDescriptor.depthCompareFunction = MTLCompareFunctionAlways; + bd->SharedMetalContext.depthStencilState = [device newDepthStencilStateWithDescriptor:depthStencilDescriptor]; + ImGui_ImplMetal_CreateFontsTexture(device); + + return true; +} + +void ImGui_ImplMetal_DestroyDeviceObjects() +{ + ImGui_ImplMetal_Data* bd = ImGui_ImplMetal_GetBackendData(); + ImGui_ImplMetal_DestroyFontsTexture(); + [bd->SharedMetalContext.renderPipelineStateCache removeAllObjects]; +} + +#pragma mark - MetalBuffer implementation + +@implementation MetalBuffer 
+- (instancetype)initWithBuffer:(id)buffer +{ + if ((self = [super init])) + { + _buffer = buffer; + _lastReuseTime = GetMachAbsoluteTimeInSeconds(); + } + return self; +} +@end + +#pragma mark - FramebufferDescriptor implementation + +@implementation FramebufferDescriptor +- (instancetype)initWithRenderPassDescriptor:(MTLRenderPassDescriptor*)renderPassDescriptor +{ + if ((self = [super init])) + { + _sampleCount = renderPassDescriptor.colorAttachments[0].texture.sampleCount; + _colorPixelFormat = renderPassDescriptor.colorAttachments[0].texture.pixelFormat; + _depthPixelFormat = renderPassDescriptor.depthAttachment.texture.pixelFormat; + _stencilPixelFormat = renderPassDescriptor.stencilAttachment.texture.pixelFormat; + } + return self; +} + +- (nonnull id)copyWithZone:(nullable NSZone*)zone +{ + FramebufferDescriptor* copy = [[FramebufferDescriptor allocWithZone:zone] init]; + copy.sampleCount = self.sampleCount; + copy.colorPixelFormat = self.colorPixelFormat; + copy.depthPixelFormat = self.depthPixelFormat; + copy.stencilPixelFormat = self.stencilPixelFormat; + return copy; +} + +- (NSUInteger)hash +{ + NSUInteger sc = _sampleCount & 0x3; + NSUInteger cf = _colorPixelFormat & 0x3FF; + NSUInteger df = _depthPixelFormat & 0x3FF; + NSUInteger sf = _stencilPixelFormat & 0x3FF; + NSUInteger hash = (sf << 22) | (df << 12) | (cf << 2) | sc; + return hash; +} + +- (BOOL)isEqual:(id)object +{ + FramebufferDescriptor* other = object; + if (![other isKindOfClass:[FramebufferDescriptor class]]) + return NO; + return other.sampleCount == self.sampleCount && + other.colorPixelFormat == self.colorPixelFormat && + other.depthPixelFormat == self.depthPixelFormat && + other.stencilPixelFormat == self.stencilPixelFormat; +} + +@end + +#pragma mark - MetalContext implementation + +@implementation MetalContext +- (instancetype)init +{ + if ((self = [super init])) + { + self.renderPipelineStateCache = [NSMutableDictionary dictionary]; + self.bufferCache = [NSMutableArray array]; + 
_lastBufferCachePurge = GetMachAbsoluteTimeInSeconds(); + } + return self; +} + +- (MetalBuffer*)dequeueReusableBufferOfLength:(NSUInteger)length device:(id)device // Returns a cached buffer of at least 'length' bytes (the least recently reused one that fits), or allocates a new shared-storage buffer. +{ + double now = GetMachAbsoluteTimeInSeconds(); // keep fractional seconds: the timestamps this is compared with and stored into are doubles + + @synchronized(self.bufferCache) + { + // Purge old buffers that haven't been useful for a while + if (now - self.lastBufferCachePurge > 1.0) + { + NSMutableArray* survivors = [NSMutableArray array]; + for (MetalBuffer* candidate in self.bufferCache) + if (candidate.lastReuseTime > self.lastBufferCachePurge) + [survivors addObject:candidate]; + [self.bufferCache setArray:survivors]; // replace the contents in place: this array is the @synchronized token, so it must remain the same instance + self.lastBufferCachePurge = now; + } + + // See if we have a buffer we can reuse + MetalBuffer* bestCandidate = nil; + for (MetalBuffer* candidate in self.bufferCache) + if (candidate.buffer.length >= length && (bestCandidate == nil || bestCandidate.lastReuseTime > candidate.lastReuseTime)) + bestCandidate = candidate; + + if (bestCandidate != nil) + { + [self.bufferCache removeObject:bestCandidate]; + bestCandidate.lastReuseTime = now; + return bestCandidate; + } + } + + // No luck; make a new buffer + id backing = [device newBufferWithLength:length options:MTLResourceStorageModeShared]; + return [[MetalBuffer alloc] initWithBuffer:backing]; +} + +// Bilinear sampling is required by default. Set 'io.Fonts->Flags |= ImFontAtlasFlags_NoBakedLines' or 'style.AntiAliasedLinesUseTex = false' to allow point/nearest sampling. 
+- (id)renderPipelineStateForFramebufferDescriptor:(FramebufferDescriptor*)descriptor device:(id)device // Compiles the embedded ImGui shaders and builds a render pipeline state matching 'descriptor'; logs and returns nil when compilation or pipeline creation fails. +{ + NSError* error = nil; + + NSString* shaderSource = @"" + "#include <metal_stdlib>\n" + "using namespace metal;\n" + "\n" + "struct Uniforms {\n" + " float4x4 projectionMatrix;\n" + "};\n" + "\n" + "struct VertexIn {\n" + " float2 position [[attribute(0)]];\n" + " float2 texCoords [[attribute(1)]];\n" + " uchar4 color [[attribute(2)]];\n" + "};\n" + "\n" + "struct VertexOut {\n" + " float4 position [[position]];\n" + " float2 texCoords;\n" + " float4 color;\n" + "};\n" + "\n" + "vertex VertexOut vertex_main(VertexIn in [[stage_in]],\n" + " constant Uniforms &uniforms [[buffer(1)]]) {\n" + " VertexOut out;\n" + " out.position = uniforms.projectionMatrix * float4(in.position, 0, 1);\n" + " out.texCoords = in.texCoords;\n" + " out.color = float4(in.color) / float4(255.0);\n" + " return out;\n" + "}\n" + "\n" + "fragment half4 fragment_main(VertexOut in [[stage_in]],\n" + " texture2d<half, access::sample> texture [[texture(0)]]) {\n" + " constexpr sampler linearSampler(coord::normalized, min_filter::linear, mag_filter::linear, mip_filter::linear);\n" + " half4 texColor = texture.sample(linearSampler, in.texCoords);\n" + " return half4(in.color) * texColor;\n" + "}\n"; + + id library = [device newLibraryWithSource:shaderSource options:nil error:&error]; + if (library == nil) + { + NSLog(@"Error: failed to create Metal library: %@", error); + return nil; + } + + id vertexFunction = [library newFunctionWithName:@"vertex_main"]; + id fragmentFunction = [library newFunctionWithName:@"fragment_main"]; + + if (vertexFunction == nil || fragmentFunction == nil) + { + NSLog(@"Error: failed to find Metal shader functions in library: %@", error); + return nil; + } + + MTLVertexDescriptor* vertexDescriptor = [MTLVertexDescriptor vertexDescriptor]; + vertexDescriptor.attributes[0].offset = IM_OFFSETOF(ImDrawVert, pos); + vertexDescriptor.attributes[0].format = MTLVertexFormatFloat2; // position + 
vertexDescriptor.attributes[0].bufferIndex = 0; + vertexDescriptor.attributes[1].offset = IM_OFFSETOF(ImDrawVert, uv); + vertexDescriptor.attributes[1].format = MTLVertexFormatFloat2; // texCoords + vertexDescriptor.attributes[1].bufferIndex = 0; + vertexDescriptor.attributes[2].offset = IM_OFFSETOF(ImDrawVert, col); + vertexDescriptor.attributes[2].format = MTLVertexFormatUChar4; // color + vertexDescriptor.attributes[2].bufferIndex = 0; + vertexDescriptor.layouts[0].stepRate = 1; + vertexDescriptor.layouts[0].stepFunction = MTLVertexStepFunctionPerVertex; + vertexDescriptor.layouts[0].stride = sizeof(ImDrawVert); + + MTLRenderPipelineDescriptor* pipelineDescriptor = [[MTLRenderPipelineDescriptor alloc] init]; + pipelineDescriptor.vertexFunction = vertexFunction; + pipelineDescriptor.fragmentFunction = fragmentFunction; + pipelineDescriptor.vertexDescriptor = vertexDescriptor; + pipelineDescriptor.rasterSampleCount = descriptor.sampleCount; // use the 'descriptor' argument: callers cache the result under it, so the state must be built from the same values + pipelineDescriptor.colorAttachments[0].pixelFormat = descriptor.colorPixelFormat; + pipelineDescriptor.colorAttachments[0].blendingEnabled = YES; + pipelineDescriptor.colorAttachments[0].rgbBlendOperation = MTLBlendOperationAdd; + pipelineDescriptor.colorAttachments[0].sourceRGBBlendFactor = MTLBlendFactorSourceAlpha; + pipelineDescriptor.colorAttachments[0].destinationRGBBlendFactor = MTLBlendFactorOneMinusSourceAlpha; + pipelineDescriptor.colorAttachments[0].alphaBlendOperation = MTLBlendOperationAdd; + pipelineDescriptor.colorAttachments[0].sourceAlphaBlendFactor = MTLBlendFactorOne; + pipelineDescriptor.colorAttachments[0].destinationAlphaBlendFactor = MTLBlendFactorOneMinusSourceAlpha; + pipelineDescriptor.depthAttachmentPixelFormat = descriptor.depthPixelFormat; + pipelineDescriptor.stencilAttachmentPixelFormat = descriptor.stencilPixelFormat; + + id renderPipelineState = [device newRenderPipelineStateWithDescriptor:pipelineDescriptor error:&error]; + if 
(renderPipelineState == nil) // failure is signalled by the nil return value; the error object only describes it + NSLog(@"Error: failed to create Metal pipeline state: %@", error); + + return renderPipelineState; +} + +@end diff --git a/src/tools/ShaderCacheMerger.cpp b/src/tools/ShaderCacheMerger.cpp index 14a54252a..7a2727dd0 100644 --- a/src/tools/ShaderCacheMerger.cpp +++ b/src/tools/ShaderCacheMerger.cpp @@ -106,6 +106,8 @@ void MergeShaderAndPipelineCacheFiles() auto filename = it.path().filename().generic_string(); if (std::regex_match(filename, std::regex("^[0-9a-fA-F]{16}(?:_shaders.bin)"))) MergeShaderCacheFile(filename); + if (std::regex_match(filename, std::regex("^[0-9a-fA-F]{16}(?:_mtlshaders.bin)"))) + MergeShaderCacheFile(filename); } printf("\nScanning for pipeline cache files to merge...\n"); for (const auto& it : fs::directory_iterator("shaderCache/transferable/")) @@ -115,6 +117,8 @@ void MergeShaderAndPipelineCacheFiles() auto filename = it.path().filename().generic_string(); if (std::regex_match(filename, std::regex("^[0-9a-fA-F]{16}(?:_vkpipeline.bin)"))) MergePipelineCacheFile(filename); + if (std::regex_match(filename, std::regex("^[0-9a-fA-F]{16}(?:_mtlpipeline.bin)"))) + MergePipelineCacheFile(filename); } }