Make the ExeFS dumper take updates into account

Merge pull request #4242 from ReinUsesLisp/maxwell-dma
maxwell_dma: Match official doc and support pitch->voxel copies
2022-02-15 22:16:50 +01:00 · 2020-07-14 14:04:16 -04:00 · 2020-07-14 12:38:03 -04:00 · 2020-07-14 12:16:42 -04:00 · 2020-07-14 12:01:43 -04:00 · 2020-07-14 11:23:10 -04:00
387 changed files with 16974 additions and 6225 deletions
--- a/.github/ISSUE_TEMPLATE/bug-report-feature-request.md
+++ b/.github/ISSUE_TEMPLATE/bug-report-feature-request.md
@@ -1,4 +1,13 @@
-<!--
+---
+name: Bug Report / Feature Request
+about: Tech support does not belong here. You should only file an issue here if you think you have experienced an actual bug with yuzu or you are requesting a feature you believe would make yuzu better.
+title: ''
+labels: ''
+assignees: ''
+
+---
+
+<!---
 Please keep in mind yuzu is EXPERIMENTAL SOFTWARE.

 Please read the FAQ:
--- a/.github/ISSUE_TEMPLATE/config.yml
+++ b/.github/ISSUE_TEMPLATE/config.yml
@@ -0,0 +1,8 @@
+blank_issues_enabled: false
+contact_links:
+  - name: yuzu Discord
+    url: https://discord.com/invite/u77vRWY
+    about: If you are experiencing an issue with yuzu, and you need tech support, or if you have a general question, try asking in the official yuzu Discord linked here. Piracy is not allowed.
+  - name: Community forums
+    url: https://community.citra-emu.org
+    about: This is an alternative place for tech support, however helpers there are not as active.
--- a/.gitmodules
+++ b/.gitmodules
@@ -13,6 +13,9 @@
 [submodule "soundtouch"]
    path = externals/soundtouch
    url = https://github.com/citra-emu/ext-soundtouch.git
+[submodule "libressl"]
+    path = externals/libressl
+    url = https://github.com/citra-emu/ext-libressl-portable.git
 [submodule "discord-rpc"]
    path = externals/discord-rpc
    url = https://github.com/discordapp/discord-rpc.git
@@ -31,3 +34,9 @@
 [submodule "xbyak"]
    path = externals/xbyak
    url = https://github.com/herumi/xbyak.git
+[submodule "externals/libusb"]
+	path = externals/libusb
+	url = https://github.com/ameerj/libusb
+[submodule "opus"]
+	path = externals/opus/opus
+	url = https://github.com/xiph/opus.git
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -118,8 +118,17 @@ message(STATUS "Target architecture: ${ARCHITECTURE}")
 # Configure C++ standard
 # ===========================

-set(CMAKE_CXX_STANDARD 17)
-set(CMAKE_CXX_STANDARD_REQUIRED ON)
+# boost asio's concept usage doesn't play nicely with some compilers yet.
+add_definitions(-DBOOST_ASIO_DISABLE_CONCEPTS)
+if (MSVC)
+    add_compile_options(/std:c++latest)
+
+    # cubeb and boost still make use of deprecated result_of.
+    add_definitions(-D_HAS_DEPRECATED_RESULT_OF)
+else()
+    set(CMAKE_CXX_STANDARD 20)
+    set(CMAKE_CXX_STANDARD_REQUIRED ON)
+endif()

 # Output binaries to bin/
 set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/bin)
@@ -151,14 +160,11 @@ macro(yuzu_find_packages)
    #    Cmake Pkg Prefix  Version     Conan Pkg
        "Boost             1.71        boost/1.72.0"
        "Catch2            2.11        catch2/2.11.0"
-        "fmt               6.2         fmt/6.2.0"
-        "OpenSSL           1.1         openssl/1.1.1f"
+        "fmt               7.0         fmt/7.0.1"
    # can't use until https://github.com/bincrafters/community/issues/1173
        #"libzip            1.5         libzip/1.5.2@bincrafters/stable"
        "lz4               1.8         lz4/1.9.2"
        "nlohmann_json     3.7         nlohmann_json/3.7.3"
-    # we need to be careful as the version check might be broken https://github.com/xiph/opus/issues/110
-        "opus              1.3         opus/1.3.1"
        "ZLIB              1.2         zlib/1.2.11"
        "zstd              1.4         zstd/1.4.4"
    )
@@ -214,7 +220,10 @@ if(ENABLE_QT)

        set(QT_PREFIX_HINT HINTS "${QT_PREFIX}")
    endif()
-    find_package(Qt5 5.9 COMPONENTS Widgets OpenGL ${QT_PREFIX_HINT})
+    find_package(Qt5 5.9 COMPONENTS Widgets ${QT_PREFIX_HINT})
+    if (YUZU_USE_QT_WEB_ENGINE)
+        find_package(Qt5 COMPONENTS WebEngineCore WebEngineWidgets)
+    endif()
    if (NOT Qt5_FOUND)
        list(APPEND CONAN_REQUIRED_LIBS "qt/5.14.1@bincrafters/stable")
    endif()
@@ -287,7 +296,7 @@ if (CONAN_REQUIRED_LIBS)
    if(ENABLE_QT)
        list(APPEND CMAKE_MODULE_PATH "${CONAN_QT_ROOT_RELEASE}")
        list(APPEND CMAKE_PREFIX_PATH "${CONAN_QT_ROOT_RELEASE}")
-        find_package(Qt5 5.9 REQUIRED COMPONENTS Widgets OpenGL)
+        find_package(Qt5 5.9 REQUIRED COMPONENTS Widgets)
        if (YUZU_USE_QT_WEB_ENGINE)
            find_package(Qt5 REQUIRED COMPONENTS WebEngineCore WebEngineWidgets)
        endif()
@@ -312,15 +321,6 @@ elseif (TARGET Boost::boost)
    add_library(boost ALIAS Boost::boost)
 endif()

-if (NOT TARGET OpenSSL::SSL)
-    set_target_properties(OpenSSL::OpenSSL PROPERTIES IMPORTED_GLOBAL TRUE)
-    add_library(OpenSSL::SSL ALIAS OpenSSL::OpenSSL)
-endif()
-if (NOT TARGET OpenSSL::Crypto)
-    set_target_properties(OpenSSL::OpenSSL PROPERTIES IMPORTED_GLOBAL TRUE)
-    add_library(OpenSSL::Crypto ALIAS OpenSSL::OpenSSL)
-endif()
-
 if (TARGET sdl2::sdl2)
    # imported from the conan generated sdl2Config.cmake
    set_target_properties(sdl2::sdl2 PROPERTIES IMPORTED_GLOBAL TRUE)
@@ -338,6 +338,17 @@ elseif(SDL2_FOUND)
    target_link_libraries(SDL2 INTERFACE "${SDL2_LIBRARIES}")
 endif()

+# Ensure libusb is properly configured (based on dolphin libusb include)
+if(NOT APPLE)
+    include(FindPkgConfig)
+    find_package(LibUSB)
+endif()
+if (NOT LIBUSB_FOUND)
+    add_subdirectory(externals/libusb)
+    set(LIBUSB_INCLUDE_DIR "")
+    set(LIBUSB_LIBRARIES usb)
+endif()
+
 # Prefer the -pthread flag on Linux.
 set(THREADS_PREFER_PTHREAD_FLAG ON)
 find_package(Threads REQUIRED)
--- a/CMakeModules/CopyYuzuQt5Deps.cmake
+++ b/CMakeModules/CopyYuzuQt5Deps.cmake
@@ -15,7 +15,6 @@ function(copy_yuzu_Qt5_deps target_dir)
        icuuc*.dll
        Qt5Core$<$<CONFIG:Debug>:d>.*
        Qt5Gui$<$<CONFIG:Debug>:d>.*
-        Qt5OpenGL$<$<CONFIG:Debug>:d>.*
        Qt5Widgets$<$<CONFIG:Debug>:d>.*
    )

--- a/CMakeModules/GenerateSCMRev.cmake
+++ b/CMakeModules/GenerateSCMRev.cmake
@@ -51,6 +51,8 @@ endif()
 # The variable SRC_DIR must be passed into the script (since it uses the current build directory for all values of CMAKE_*_DIR)
 set(VIDEO_CORE "${SRC_DIR}/src/video_core")
 set(HASH_FILES
+    "${VIDEO_CORE}/renderer_opengl/gl_arb_decompiler.cpp"
+    "${VIDEO_CORE}/renderer_opengl/gl_arb_decompiler.h"
    "${VIDEO_CORE}/renderer_opengl/gl_shader_cache.cpp"
    "${VIDEO_CORE}/renderer_opengl/gl_shader_cache.h"
    "${VIDEO_CORE}/renderer_opengl/gl_shader_decompiler.cpp"
--- a/README.md
+++ b/README.md
@@ -2,7 +2,7 @@ yuzu emulator
 =============
 [![Travis CI Build Status](https://travis-ci.com/yuzu-emu/yuzu.svg?branch=master)](https://travis-ci.com/yuzu-emu/yuzu)
 [![Azure Mainline CI Build Status](https://dev.azure.com/yuzu-emu/yuzu/_apis/build/status/yuzu%20mainline?branchName=master)](https://dev.azure.com/yuzu-emu/yuzu/)
-[![Discord](https://img.shields.io/discord/398318088170242053?color=%237289DA&label=yuzu&logo=discord&logoColor=white)](https://discord.gg/XQV6dn9)
+[![Discord](https://img.shields.io/discord/398318088170242053?color=%237289DA&label=yuzu&logo=discord&logoColor=white)](https://discord.com/invite/u77vRWY)

 yuzu is an experimental open-source emulator for the Nintendo Switch from the creators of [Citra](https://citra-emu.org/).

@@ -16,7 +16,7 @@ yuzu is licensed under the GPLv2 (or any later version). Refer to the license.tx

 Check out our [website](https://yuzu-emu.org/)!

-For development discussion, please join us on [Discord](https://discord.gg/XQV6dn9).
+For development discussion, please join us on [Discord](https://discord.com/invite/u77vRWY).

 ### Development

--- a/dist/qt_themes/qdarkstyle/style.qss
+++ b/dist/qt_themes/qdarkstyle/style.qss
@@ -673,10 +673,6 @@ QTabWidget::pane {
    border-bottom-left-radius: 2px;
 }

-QTabWidget::tab-bar {
-    overflow: visible;
-}
-
 QTabBar {
    qproperty-drawBase: 0;
    border-radius: 3px;
--- a/dist/yuzu.manifest
+++ b/dist/yuzu.manifest
@@ -1,24 +1,58 @@
 <?xml version="1.0" encoding="UTF-8" standalone="yes"?>
-<assembly xmlns="urn:schemas-microsoft-com:asm.v1" manifestVersion="1.0">
- <trustInfo xmlns="urn:schemas-microsoft-com:asm.v3">
-  <security>
-   <requestedPrivileges>
-    <requestedExecutionLevel level="asInvoker" uiAccess="false"/>
-   </requestedPrivileges>
-  </security>
- </trustInfo>
- <application xmlns="urn:schemas-microsoft-com:asm.v3">
-  <windowsSettings>
-   <dpiAware xmlns="http://schemas.microsoft.com/SMI/2005/WindowsSettings">True/PM</dpiAware>
-   <longPathAware xmlns="http://schemas.microsoft.com/SMI/2016/WindowsSettings">true</longPathAware>
-  </windowsSettings>
- </application>
- <compatibility xmlns="urn:schemas-microsoft-com:compatibility.v1">
-  <application>
-   <supportedOS Id="{35138b9a-5d96-4fbd-8e2d-a2440225f93a}"/>
-   <supportedOS Id="{4a2f28e3-53b9-4441-ba9c-d69d4a4a6e38}"/>
-   <supportedOS Id="{1f676c76-80e1-4239-95bb-83d0f6d0da78}"/>
-   <supportedOS Id="{8e0f7a12-bfb3-4fe8-b9a5-48fd50a15a9a}"/>
-  </application>
- </compatibility>
-</assembly>
+<assembly manifestVersion="1.0"
+    xmlns="urn:schemas-microsoft-com:asm.v1"
+    xmlns:asmv3="urn:schemas-microsoft-com:asm.v3">
+  <asmv3:application>
+    <asmv3:windowsSettings>
+      <!-- Windows 7/8/8.1/10 -->
+      <dpiAware
+        xmlns="http://schemas.microsoft.com/SMI/2005/WindowsSettings">
+        true/pm
+      </dpiAware>
+      <!-- Windows 10, version 1607 or later -->
+      <dpiAwareness
+        xmlns="http://schemas.microsoft.com/SMI/2016/WindowsSettings">
+        PerMonitorV2
+      </dpiAwareness>
+      <!-- Windows 10, version 1703 or later -->
+      <gdiScaling
+          xmlns="http://schemas.microsoft.com/SMI/2017/WindowsSettings">
+        true
+      </gdiScaling>
+      <ws2:longPathAware
+          xmlns:ws2="http://schemas.microsoft.com/SMI/2016/WindowsSettings">
+        true
+      </ws2:longPathAware>
+    </asmv3:windowsSettings>
+  </asmv3:application>
+  <compatibility
+      xmlns="urn:schemas-microsoft-com:compatibility.v1">
+    <application>
+      <!-- Windows 10 -->
+      <supportedOS Id="{8e0f7a12-bfb3-4fe8-b9a5-48fd50a15a9a}"/>
+      <!-- Windows 8.1 -->
+      <supportedOS Id="{1f676c76-80e1-4239-95bb-83d0f6d0da78}"/>
+      <!-- Windows 8 -->
+      <supportedOS Id="{4a2f28e3-53b9-4441-ba9c-d69d4a4a6e38}"/>
+      <!-- Windows 7 -->
+      <supportedOS Id="{35138b9a-5d96-4fbd-8e2d-a2440225f93a}"/>
+    </application>
+  </compatibility>
+  <trustInfo
+      xmlns="urn:schemas-microsoft-com:asm.v3">
+    <security>
+      <requestedPrivileges>
+        <!--
+          UAC settings:
+          - app should run at same integrity level as calling process
+          - app does not need to manipulate windows belonging to
+            higher-integrity-level processes
+          -->
+        <requestedExecutionLevel
+            level="asInvoker"
+            uiAccess="false"
+        />
+      </requestedPrivileges>
+    </security>
+  </trustInfo>
+</assembly>
--- a/externals/CMakeLists.txt
+++ b/externals/CMakeLists.txt
@@ -4,6 +4,13 @@ list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/CMakeModules")
 list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/externals/find-modules")
 include(DownloadExternals)

+# xbyak
+if (ARCHITECTURE_x86 OR ARCHITECTURE_x86_64)
+    add_library(xbyak INTERFACE)
+    target_include_directories(xbyak SYSTEM INTERFACE ./xbyak/xbyak)
+    target_compile_definitions(xbyak INTERFACE XBYAK_NO_OP_NAMES)
+endif()
+
 # Catch
 add_library(catch-single-include INTERFACE)
 target_include_directories(catch-single-include INTERFACE catch/single_include)
@@ -66,6 +73,15 @@ if (NOT LIBZIP_FOUND)
 endif()

 if (ENABLE_WEB_SERVICE)
+    # LibreSSL
+    set(LIBRESSL_SKIP_INSTALL ON CACHE BOOL "")
+    add_subdirectory(libressl EXCLUDE_FROM_ALL)
+    target_include_directories(ssl INTERFACE ./libressl/include)
+    target_compile_definitions(ssl PRIVATE -DHAVE_INET_NTOP)
+    get_directory_property(OPENSSL_LIBRARIES
+        DIRECTORY libressl
+        DEFINITION OPENSSL_LIBS)
+
    # lurlparser
    add_subdirectory(lurlparser EXCLUDE_FROM_ALL)

@@ -73,13 +89,8 @@ if (ENABLE_WEB_SERVICE)
    add_library(httplib INTERFACE)
    target_include_directories(httplib INTERFACE ./httplib)
    target_compile_definitions(httplib INTERFACE -DCPPHTTPLIB_OPENSSL_SUPPORT)
-    target_link_libraries(httplib INTERFACE OpenSSL::SSL OpenSSL::Crypto)
+    target_link_libraries(httplib INTERFACE ${OPENSSL_LIBRARIES})
 endif()

-if (NOT TARGET xbyak)
-    if (ARCHITECTURE_x86 OR ARCHITECTURE_x86_64)
-        add_library(xbyak INTERFACE)
-        target_include_directories(xbyak SYSTEM INTERFACE ./xbyak/xbyak)
-        target_compile_definitions(xbyak INTERFACE XBYAK_NO_OP_NAMES)
-    endif()
-endif()
+# Opus
+add_subdirectory(opus)
--- a/externals/Vulkan-Headers
+++ b/externals/Vulkan-Headers
--- a/externals/dynarmic
+++ b/externals/dynarmic
--- a/externals/find-modules/FindLibUSB.cmake
+++ b/externals/find-modules/FindLibUSB.cmake
@@ -0,0 +1,43 @@
+# - Find libusb-1.0 library
+# This module defines
+#  LIBUSB_INCLUDE_DIR, where to find libusb.h
+#  LIBUSB_LIBRARIES, the libraries needed to use libusb-1.0.
+#  LIBUSB_FOUND, If false, do not try to use libusb-1.0.
+#
+# Copyright (c) 2009, Michal Cihar, <michal@cihar.com>
+#
+# vim: expandtab sw=4 ts=4 sts=4:
+
+if(ANDROID)
+       set(LIBUSB_FOUND FALSE CACHE INTERNAL "libusb-1.0 found")
+       message(STATUS "libusb-1.0 not found.")
+elseif (NOT LIBUSB_FOUND)
+    pkg_check_modules (LIBUSB_PKG libusb-1.0)
+
+    find_path(LIBUSB_INCLUDE_DIR NAMES libusb.h
+       PATHS
+       ${LIBUSB_PKG_INCLUDE_DIRS}
+       /usr/include/libusb-1.0
+       /usr/include
+       /usr/local/include/libusb-1.0
+       /usr/local/include
+    )
+
+    find_library(LIBUSB_LIBRARIES NAMES usb-1.0 usb
+       PATHS
+       ${LIBUSB_PKG_LIBRARY_DIRS}
+       /usr/lib
+       /usr/local/lib
+    )
+
+    if(LIBUSB_INCLUDE_DIR AND LIBUSB_LIBRARIES)
+       set(LIBUSB_FOUND TRUE CACHE INTERNAL "libusb-1.0 found")
+       message(STATUS "Found libusb-1.0: ${LIBUSB_INCLUDE_DIR}, ${LIBUSB_LIBRARIES}")
+    else(LIBUSB_INCLUDE_DIR AND LIBUSB_LIBRARIES)
+       set(LIBUSB_FOUND FALSE CACHE INTERNAL "libusb-1.0 found")
+       message(STATUS "libusb-1.0 not found.")
+    endif(LIBUSB_INCLUDE_DIR AND LIBUSB_LIBRARIES)
+
+    mark_as_advanced(LIBUSB_INCLUDE_DIR LIBUSB_LIBRARIES)
+endif ()
+
--- a/externals/libressl
+++ b/externals/libressl
--- a/externals/libusb
+++ b/externals/libusb
--- a/externals/opus/CMakeLists.txt
+++ b/externals/opus/CMakeLists.txt
@@ -0,0 +1,254 @@
+cmake_minimum_required(VERSION 3.8)
+
+project(opus)
+
+option(OPUS_STACK_PROTECTOR "Use stack protection" OFF)
+option(OPUS_USE_ALLOCA "Use alloca for stack arrays (on non-C99 compilers)" OFF)
+option(OPUS_CUSTOM_MODES "Enable non-Opus modes, e.g. 44.1 kHz & 2^n frames" OFF)
+option(OPUS_FIXED_POINT "Compile as fixed-point (for machines without a fast enough FPU)" OFF)
+option(OPUS_ENABLE_FLOAT_API "Compile with the floating point API (for machines with float library)" ON)
+
+include(opus/opus_functions.cmake)
+
+if(OPUS_STACK_PROTECTOR)
+    if(NOT MSVC) # GS on by default on MSVC
+        check_and_set_flag(STACK_PROTECTION_STRONG -fstack-protector-strong)
+    endif()
+else()
+    if(MSVC)
+        check_and_set_flag(BUFFER_SECURITY_CHECK /GS-)
+    endif()
+endif()
+
+add_library(opus STATIC
+    # CELT sources
+    opus/celt/bands.c
+    opus/celt/celt.c
+    opus/celt/celt_decoder.c
+    opus/celt/celt_encoder.c
+    opus/celt/celt_lpc.c
+    opus/celt/cwrs.c
+    opus/celt/entcode.c
+    opus/celt/entdec.c
+    opus/celt/entenc.c
+    opus/celt/kiss_fft.c
+    opus/celt/laplace.c
+    opus/celt/mathops.c
+    opus/celt/mdct.c
+    opus/celt/modes.c
+    opus/celt/pitch.c
+    opus/celt/quant_bands.c
+    opus/celt/rate.c
+    opus/celt/vq.c
+
+    # SILK sources
+    opus/silk/A2NLSF.c
+    opus/silk/CNG.c
+    opus/silk/HP_variable_cutoff.c
+    opus/silk/LPC_analysis_filter.c
+    opus/silk/LPC_fit.c
+    opus/silk/LPC_inv_pred_gain.c
+    opus/silk/LP_variable_cutoff.c
+    opus/silk/NLSF2A.c
+    opus/silk/NLSF_VQ.c
+    opus/silk/NLSF_VQ_weights_laroia.c
+    opus/silk/NLSF_decode.c
+    opus/silk/NLSF_del_dec_quant.c
+    opus/silk/NLSF_encode.c
+    opus/silk/NLSF_stabilize.c
+    opus/silk/NLSF_unpack.c
+    opus/silk/NSQ.c
+    opus/silk/NSQ_del_dec.c
+    opus/silk/PLC.c
+    opus/silk/VAD.c
+    opus/silk/VQ_WMat_EC.c
+    opus/silk/ana_filt_bank_1.c
+    opus/silk/biquad_alt.c
+    opus/silk/bwexpander.c
+    opus/silk/bwexpander_32.c
+    opus/silk/check_control_input.c
+    opus/silk/code_signs.c
+    opus/silk/control_SNR.c
+    opus/silk/control_audio_bandwidth.c
+    opus/silk/control_codec.c
+    opus/silk/dec_API.c
+    opus/silk/decode_core.c
+    opus/silk/decode_frame.c
+    opus/silk/decode_indices.c
+    opus/silk/decode_parameters.c
+    opus/silk/decode_pitch.c
+    opus/silk/decode_pulses.c
+    opus/silk/decoder_set_fs.c
+    opus/silk/enc_API.c
+    opus/silk/encode_indices.c
+    opus/silk/encode_pulses.c
+    opus/silk/gain_quant.c
+    opus/silk/init_decoder.c
+    opus/silk/init_encoder.c
+    opus/silk/inner_prod_aligned.c
+    opus/silk/interpolate.c
+    opus/silk/lin2log.c
+    opus/silk/log2lin.c
+    opus/silk/pitch_est_tables.c
+    opus/silk/process_NLSFs.c
+    opus/silk/quant_LTP_gains.c
+    opus/silk/resampler.c
+    opus/silk/resampler_down2.c
+    opus/silk/resampler_down2_3.c
+    opus/silk/resampler_private_AR2.c
+    opus/silk/resampler_private_IIR_FIR.c
+    opus/silk/resampler_private_down_FIR.c
+    opus/silk/resampler_private_up2_HQ.c
+    opus/silk/resampler_rom.c
+    opus/silk/shell_coder.c
+    opus/silk/sigm_Q15.c
+    opus/silk/sort.c
+    opus/silk/stereo_LR_to_MS.c
+    opus/silk/stereo_MS_to_LR.c
+    opus/silk/stereo_decode_pred.c
+    opus/silk/stereo_encode_pred.c
+    opus/silk/stereo_find_predictor.c
+    opus/silk/stereo_quant_pred.c
+    opus/silk/sum_sqr_shift.c
+    opus/silk/table_LSF_cos.c
+    opus/silk/tables_LTP.c
+    opus/silk/tables_NLSF_CB_NB_MB.c
+    opus/silk/tables_NLSF_CB_WB.c
+    opus/silk/tables_gain.c
+    opus/silk/tables_other.c
+    opus/silk/tables_pitch_lag.c
+    opus/silk/tables_pulses_per_block.c
+
+    # Opus sources
+    opus/src/analysis.c
+    opus/src/mapping_matrix.c
+    opus/src/mlp.c
+    opus/src/mlp_data.c
+    opus/src/opus.c
+    opus/src/opus_decoder.c
+    opus/src/opus_encoder.c
+    opus/src/opus_multistream.c
+    opus/src/opus_multistream_decoder.c
+    opus/src/opus_multistream_encoder.c
+    opus/src/opus_projection_decoder.c
+    opus/src/opus_projection_encoder.c
+    opus/src/repacketizer.c
+)
+
+if (DEBUG)
+    target_sources(opus PRIVATE opus/silk/debug.c)
+endif()
+
+if (OPUS_FIXED_POINT)
+    target_sources(opus PRIVATE
+        opus/silk/fixed/LTP_analysis_filter_FIX.c
+        opus/silk/fixed/LTP_scale_ctrl_FIX.c
+        opus/silk/fixed/apply_sine_window_FIX.c
+        opus/silk/fixed/autocorr_FIX.c
+        opus/silk/fixed/burg_modified_FIX.c
+        opus/silk/fixed/corrMatrix_FIX.c
+        opus/silk/fixed/encode_frame_FIX.c
+        opus/silk/fixed/find_LPC_FIX.c
+        opus/silk/fixed/find_LTP_FIX.c
+        opus/silk/fixed/find_pitch_lags_FIX.c
+        opus/silk/fixed/find_pred_coefs_FIX.c
+        opus/silk/fixed/k2a_FIX.c
+        opus/silk/fixed/k2a_Q16_FIX.c
+        opus/silk/fixed/noise_shape_analysis_FIX.c
+        opus/silk/fixed/pitch_analysis_core_FIX.c
+        opus/silk/fixed/prefilter_FIX.c
+        opus/silk/fixed/process_gains_FIX.c
+        opus/silk/fixed/regularize_correlations_FIX.c
+        opus/silk/fixed/residual_energy16_FIX.c
+        opus/silk/fixed/residual_energy_FIX.c
+        opus/silk/fixed/schur64_FIX.c
+        opus/silk/fixed/schur_FIX.c
+        opus/silk/fixed/solve_LS_FIX.c
+        opus/silk/fixed/vector_ops_FIX.c
+        opus/silk/fixed/warped_autocorrelation_FIX.c
+    )
+else()
+    target_sources(opus PRIVATE
+        opus/silk/float/LPC_analysis_filter_FLP.c
+        opus/silk/float/LPC_inv_pred_gain_FLP.c
+        opus/silk/float/LTP_analysis_filter_FLP.c
+        opus/silk/float/LTP_scale_ctrl_FLP.c
+        opus/silk/float/apply_sine_window_FLP.c
+        opus/silk/float/autocorrelation_FLP.c
+        opus/silk/float/burg_modified_FLP.c
+        opus/silk/float/bwexpander_FLP.c
+        opus/silk/float/corrMatrix_FLP.c
+        opus/silk/float/encode_frame_FLP.c
+        opus/silk/float/energy_FLP.c
+        opus/silk/float/find_LPC_FLP.c
+        opus/silk/float/find_LTP_FLP.c
+        opus/silk/float/find_pitch_lags_FLP.c
+        opus/silk/float/find_pred_coefs_FLP.c
+        opus/silk/float/inner_product_FLP.c
+        opus/silk/float/k2a_FLP.c
+        opus/silk/float/noise_shape_analysis_FLP.c
+        opus/silk/float/pitch_analysis_core_FLP.c
+        opus/silk/float/process_gains_FLP.c
+        opus/silk/float/regularize_correlations_FLP.c
+        opus/silk/float/residual_energy_FLP.c
+        opus/silk/float/scale_copy_vector_FLP.c
+        opus/silk/float/scale_vector_FLP.c
+        opus/silk/float/schur_FLP.c
+        opus/silk/float/sort_FLP.c
+        opus/silk/float/warped_autocorrelation_FLP.c
+        opus/silk/float/wrappers_FLP.c
+    )
+endif()
+
+target_compile_definitions(opus PRIVATE OPUS_BUILD ENABLE_HARDENING)
+
+if(NOT MSVC)
+    if(MINGW)
+        target_compile_definitions(opus PRIVATE _FORTIFY_SOURCE=0)
+    else()
+        target_compile_definitions(opus PRIVATE _FORTIFY_SOURCE=2)
+    endif()
+endif()
+
+# It is strongly recommended to define one of these. VAR_ARRAYS: use C99
+# variable-length arrays for stack allocation. USE_ALLOCA: use alloca() for
+# stack allocation. If neither is defined, the fallback is a non-threadsafe
+# global array.
+if(OPUS_USE_ALLOCA OR MSVC)
+    target_compile_definitions(opus PRIVATE USE_ALLOCA)
+else()
+    target_compile_definitions(opus PRIVATE VAR_ARRAYS)
+endif()
+
+if(OPUS_CUSTOM_MODES)
+    target_compile_definitions(opus PRIVATE CUSTOM_MODES)
+endif()
+
+if(NOT OPUS_ENABLE_FLOAT_API)
+    target_compile_definitions(opus PRIVATE DISABLE_FLOAT_API)
+endif()
+
+target_compile_definitions(opus
+PUBLIC
+    -DOPUS_VERSION="\\"1.3.1\\""
+
+PRIVATE
+    # Use C99 intrinsics to speed up float-to-int conversion
+    HAVE_LRINTF
+)
+
+if (OPUS_FIXED_POINT) # same option that selects the silk/fixed sources above
+    target_compile_definitions(opus PRIVATE -DFIXED_POINT=1 -DDISABLE_FLOAT_API)
+endif()
+
+target_include_directories(opus
+PUBLIC
+    opus/include
+
+PRIVATE
+    opus/celt
+    opus/silk
+    opus/silk/fixed
+    opus/silk/float
+    opus/src
+)
--- a/externals/opus/opus
+++ b/externals/opus/opus
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -62,6 +62,10 @@ else()
        -Wno-unused-parameter
    )

+    if (ARCHITECTURE_x86_64)
+        add_compile_options("-mcx16")
+    endif()
+
    if (APPLE AND CMAKE_CXX_COMPILER_ID STREQUAL Clang)
        add_compile_options("-stdlib=libc++")
    endif()
--- a/src/audio_core/audio_renderer.cpp
+++ b/src/audio_core/audio_renderer.cpp
@@ -180,11 +180,12 @@ ResultVal<std::vector<u8>> AudioRenderer::UpdateAudioRenderer(const std::vector<

    // Copy output header
    UpdateDataHeader response_data{worker_params};
-    std::vector<u8> output_params(response_data.total_size);
    if (behavior_info.IsElapsedFrameCountSupported()) {
-        response_data.frame_count = 0x10;
-        response_data.total_size += 0x10;
+        response_data.render_info = sizeof(RendererInfo);
+        response_data.total_size += sizeof(RendererInfo);
    }
+
+    std::vector<u8> output_params(response_data.total_size);
    std::memcpy(output_params.data(), &response_data, sizeof(UpdateDataHeader));

    // Copy output memory pool entries
@@ -219,6 +220,17 @@ ResultVal<std::vector<u8>> AudioRenderer::UpdateAudioRenderer(const std::vector<
        return Audren::ERR_INVALID_PARAMETERS;
    }

+    if (behavior_info.IsElapsedFrameCountSupported()) {
+        const std::size_t renderer_info_offset{
+            sizeof(UpdateDataHeader) + response_data.memory_pools_size + response_data.voices_size +
+            response_data.effects_size + response_data.sinks_size +
+            response_data.performance_manager_size + response_data.behavior_size};
+        RendererInfo renderer_info{};
+        renderer_info.elasped_frame_count = elapsed_frame_count;
+        std::memcpy(output_params.data() + renderer_info_offset, &renderer_info,
+                    sizeof(RendererInfo));
+    }
+
    return MakeResult(output_params);
 }

@@ -447,6 +459,7 @@ void AudioRenderer::QueueMixedBuffer(Buffer::Tag tag) {
        }
    }
    audio_out->QueueBuffer(stream, tag, std::move(buffer));
+    elapsed_frame_count++;
 }

 void AudioRenderer::ReleaseAndQueueBuffers() {
--- a/src/audio_core/audio_renderer.h
+++ b/src/audio_core/audio_renderer.h
@@ -196,6 +196,12 @@ struct EffectOutStatus {
 };
 static_assert(sizeof(EffectOutStatus) == 0x10, "EffectOutStatus is an invalid size");

+struct RendererInfo {
+    u64_le elasped_frame_count{};
+    INSERT_PADDING_WORDS(2);
+};
+static_assert(sizeof(RendererInfo) == 0x10, "RendererInfo is an invalid size");
+
 struct UpdateDataHeader {
    UpdateDataHeader() {}

@@ -209,7 +215,7 @@ struct UpdateDataHeader {
        mixes_size = 0x0;
        sinks_size = config.sink_count * 0x20;
        performance_manager_size = 0x10;
-        frame_count = 0;
+        render_info = 0;
        total_size = sizeof(UpdateDataHeader) + behavior_size + memory_pools_size + voices_size +
                     effects_size + sinks_size + performance_manager_size;
    }
@@ -223,8 +229,8 @@ struct UpdateDataHeader {
    u32_le mixes_size{};
    u32_le sinks_size{};
    u32_le performance_manager_size{};
-    INSERT_PADDING_WORDS(1);
-    u32_le frame_count{};
+    u32_le splitter_size{};
+    u32_le render_info{};
    INSERT_PADDING_WORDS(4);
    u32_le total_size{};
 };
@@ -258,6 +264,7 @@ private:
    std::unique_ptr<AudioOut> audio_out;
    StreamPtr stream;
    Core::Memory::Memory& memory;
+    std::size_t elapsed_frame_count{};
 };

 } // namespace AudioCore
--- a/src/audio_core/cubeb_sink.cpp
+++ b/src/audio_core/cubeb_sink.cpp
@@ -193,7 +193,7 @@ long CubebSinkStream::DataCallback(cubeb_stream* stream, void* user_data, const
    const std::size_t samples_to_write = num_channels * num_frames;
    std::size_t samples_written;

-    if (Settings::values.enable_audio_stretching) {
+    if (Settings::values.enable_audio_stretching.GetValue()) {
        const std::vector<s16> in{impl->queue.Pop()};
        const std::size_t num_in{in.size() / num_channels};
        s16* const out{reinterpret_cast<s16*>(buffer)};
--- a/src/audio_core/stream.cpp
+++ b/src/audio_core/stream.cpp
@@ -38,7 +38,7 @@ Stream::Stream(Core::Timing::CoreTiming& core_timing, u32 sample_rate, Format fo
      sink_stream{sink_stream}, core_timing{core_timing}, name{std::move(name_)} {

    release_event = Core::Timing::CreateEvent(
-        name, [this](u64 userdata, s64 cycles_late) { ReleaseActiveBuffer(); });
+        name, [this](u64 userdata, s64 cycles_late) { ReleaseActiveBuffer(cycles_late); });
 }

 void Stream::Play() {
@@ -59,15 +59,15 @@ Stream::State Stream::GetState() const {
    return state;
 }

-s64 Stream::GetBufferReleaseCycles(const Buffer& buffer) const {
+s64 Stream::GetBufferReleaseNS(const Buffer& buffer) const {
    const std::size_t num_samples{buffer.GetSamples().size() / GetNumChannels()};
-    const auto us =
-        std::chrono::microseconds((static_cast<u64>(num_samples) * 1000000) / sample_rate);
-    return Core::Timing::usToCycles(us);
+    const auto ns =
+        std::chrono::nanoseconds((static_cast<u64>(num_samples) * 1000000000ULL) / sample_rate);
+    return ns.count();
 }

 static void VolumeAdjustSamples(std::vector<s16>& samples, float game_volume) {
-    const float volume{std::clamp(Settings::values.volume - (1.0f - game_volume), 0.0f, 1.0f)};
+    const float volume{std::clamp(Settings::Volume() - (1.0f - game_volume), 0.0f, 1.0f)};

    if (volume == 1.0f) {
        return;
@@ -80,7 +80,7 @@ static void VolumeAdjustSamples(std::vector<s16>& samples, float game_volume) {
    }
 }

-void Stream::PlayNextBuffer() {
+void Stream::PlayNextBuffer(s64 cycles_late) {
    if (!IsPlaying()) {
        // Ensure we are in playing state before playing the next buffer
        sink_stream.Flush();
@@ -105,14 +105,17 @@ void Stream::PlayNextBuffer() {

    sink_stream.EnqueueSamples(GetNumChannels(), active_buffer->GetSamples());

-    core_timing.ScheduleEvent(GetBufferReleaseCycles(*active_buffer), release_event, {});
+    core_timing.ScheduleEvent(
+        GetBufferReleaseNS(*active_buffer) -
+            (Settings::values.enable_audio_stretching.GetValue() ? 0 : cycles_late),
+        release_event, {});
 }

-void Stream::ReleaseActiveBuffer() {
+void Stream::ReleaseActiveBuffer(s64 cycles_late) {
    ASSERT(active_buffer);
    released_buffers.push(std::move(active_buffer));
    release_callback();
-    PlayNextBuffer();
+    PlayNextBuffer(cycles_late);
 }

 bool Stream::QueueBuffer(BufferPtr&& buffer) {
--- a/src/audio_core/stream.h
+++ b/src/audio_core/stream.h
@@ -90,13 +90,16 @@ public:

 private:
    /// Plays the next queued buffer in the audio stream, starting playback if necessary
-    void PlayNextBuffer();
+    void PlayNextBuffer(s64 cycles_late = 0);

    /// Releases the actively playing buffer, signalling that it has been completed
-    void ReleaseActiveBuffer();
+    void ReleaseActiveBuffer(s64 cycles_late = 0);

    /// Gets the number of core cycles when the specified buffer will be released
-    s64 GetBufferReleaseCycles(const Buffer& buffer) const;
+    s64 GetBufferReleaseNS(const Buffer& buffer) const;
+
+    /// Gets the number of core cycles when the specified buffer will be released
+    s64 GetBufferReleaseNSHostTiming(const Buffer& buffer) const;

    u32 sample_rate;                  ///< Sample rate of the stream
    Format format;                    ///< Format of the stream
--- a/src/common/CMakeLists.txt
+++ b/src/common/CMakeLists.txt
@@ -32,6 +32,8 @@ add_custom_command(OUTPUT scm_rev.cpp
    DEPENDS
      # WARNING! It was too much work to try and make a common location for this list,
      # so if you need to change it, please update CMakeModules/GenerateSCMRev.cmake as well
+      "${VIDEO_CORE}/renderer_opengl/gl_arb_decompiler.cpp"
+      "${VIDEO_CORE}/renderer_opengl/gl_arb_decompiler.h"
      "${VIDEO_CORE}/renderer_opengl/gl_shader_cache.cpp"
      "${VIDEO_CORE}/renderer_opengl/gl_shader_cache.h"
      "${VIDEO_CORE}/renderer_opengl/gl_shader_decompiler.cpp"
@@ -96,6 +98,8 @@ add_library(common STATIC
    algorithm.h
    alignment.h
    assert.h
+    atomic_ops.cpp
+    atomic_ops.h
    detached_tasks.cpp
    detached_tasks.h
    bit_field.h
@@ -108,6 +112,8 @@ add_library(common STATIC
    common_types.h
    dynamic_library.cpp
    dynamic_library.h
+    fiber.cpp
+    fiber.h
    file_util.cpp
    file_util.h
    hash.h
@@ -141,6 +147,8 @@ add_library(common STATIC
    scm_rev.cpp
    scm_rev.h
    scope_exit.h
+    spin_lock.cpp
+    spin_lock.h
    string_util.cpp
    string_util.h
    swap.h
@@ -161,6 +169,8 @@ add_library(common STATIC
    vector_math.h
    virtual_buffer.cpp
    virtual_buffer.h
+    wall_clock.cpp
+    wall_clock.h
    web_result.h
    zstd_compression.cpp
    zstd_compression.h
@@ -171,12 +181,15 @@ if(ARCHITECTURE_x86_64)
        PRIVATE
            x64/cpu_detect.cpp
            x64/cpu_detect.h
+            x64/native_clock.cpp
+            x64/native_clock.h
            x64/xbyak_abi.h
            x64/xbyak_util.h
    )
 endif()

 create_target_directory_groups(common)
+find_package(Boost 1.71 COMPONENTS context headers REQUIRED)

-target_link_libraries(common PUBLIC Boost::boost fmt::fmt microprofile)
+target_link_libraries(common PUBLIC ${Boost_LIBRARIES} fmt::fmt microprofile)
 target_link_libraries(common PRIVATE lz4::lz4 zstd::zstd xbyak)
--- a/src/common/alignment.h
+++ b/src/common/alignment.h
@@ -11,7 +11,9 @@ namespace Common {
 template <typename T>
 constexpr T AlignUp(T value, std::size_t size) {
    static_assert(std::is_unsigned_v<T>, "T must be an unsigned value.");
-    return static_cast<T>(value + (size - value % size) % size);
+    auto mod{static_cast<T>(value % size)};
+    value -= mod;
+    return static_cast<T>(mod == T{0} ? value : value + size);
 }

 template <typename T>
--- a/src/common/atomic_ops.cpp
+++ b/src/common/atomic_ops.cpp
@@ -0,0 +1,70 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include <cstring>
+
+#include "common/atomic_ops.h"
+
+#if _MSC_VER
+#include <intrin.h>
+#endif
+
+namespace Common {
+
+#if _MSC_VER
+
+bool AtomicCompareAndSwap(u8 volatile* pointer, u8 value, u8 expected) {
+    u8 result = _InterlockedCompareExchange8((char*)pointer, value, expected);
+    return result == expected;
+}
+
+bool AtomicCompareAndSwap(u16 volatile* pointer, u16 value, u16 expected) {
+    u16 result = _InterlockedCompareExchange16((short*)pointer, value, expected);
+    return result == expected;
+}
+
+bool AtomicCompareAndSwap(u32 volatile* pointer, u32 value, u32 expected) {
+    u32 result = _InterlockedCompareExchange((long*)pointer, value, expected);
+    return result == expected;
+}
+
+bool AtomicCompareAndSwap(u64 volatile* pointer, u64 value, u64 expected) {
+    u64 result = _InterlockedCompareExchange64((__int64*)pointer, value, expected);
+    return result == expected;
+}
+
+bool AtomicCompareAndSwap(u64 volatile* pointer, u128 value, u128 expected) {
+    return _InterlockedCompareExchange128((__int64*)pointer, value[1], value[0],
+                                          (__int64*)expected.data()) != 0;
+}
+
+#else
+
+bool AtomicCompareAndSwap(u8 volatile* pointer, u8 value, u8 expected) {
+    return __sync_bool_compare_and_swap(pointer, expected, value);
+}
+
+bool AtomicCompareAndSwap(u16 volatile* pointer, u16 value, u16 expected) {
+    return __sync_bool_compare_and_swap(pointer, expected, value);
+}
+
+bool AtomicCompareAndSwap(u32 volatile* pointer, u32 value, u32 expected) {
+    return __sync_bool_compare_and_swap(pointer, expected, value);
+}
+
+bool AtomicCompareAndSwap(u64 volatile* pointer, u64 value, u64 expected) {
+    return __sync_bool_compare_and_swap(pointer, expected, value);
+}
+
+bool AtomicCompareAndSwap(u64 volatile* pointer, u128 value, u128 expected) {
+    unsigned __int128 value_a;
+    unsigned __int128 expected_a;
+    std::memcpy(&value_a, value.data(), sizeof(u128));
+    std::memcpy(&expected_a, expected.data(), sizeof(u128));
+    return __sync_bool_compare_and_swap((unsigned __int128*)pointer, expected_a, value_a);
+}
+
+#endif
+
+} // namespace Common
--- a/src/common/atomic_ops.h
+++ b/src/common/atomic_ops.h
@@ -0,0 +1,17 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include "common/common_types.h"
+
+namespace Common {
+
+/// Atomic compare-and-swap: if *pointer currently equals `expected`, store `value` into it.
+/// Returns true when the swap was performed, false when *pointer held a different value.
+/// Note the argument order: the NEW value comes before the EXPECTED value.
+bool AtomicCompareAndSwap(u8 volatile* pointer, u8 value, u8 expected);
+bool AtomicCompareAndSwap(u16 volatile* pointer, u16 value, u16 expected);
+bool AtomicCompareAndSwap(u32 volatile* pointer, u32 value, u32 expected);
+bool AtomicCompareAndSwap(u64 volatile* pointer, u64 value, u64 expected);
+/// 128-bit variant: `pointer` addresses the 16-byte object to update (it is reinterpreted as a
+/// 128-bit integer by the implementation).
+/// NOTE(review): the underlying intrinsics presumably require 16-byte alignment - confirm.
+bool AtomicCompareAndSwap(u64 volatile* pointer, u128 value, u128 expected);
+
+} // namespace Common
--- a/src/common/fiber.cpp
+++ b/src/common/fiber.cpp
@@ -0,0 +1,222 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include "common/assert.h"
+#include "common/fiber.h"
+#if defined(_WIN32) || defined(WIN32)
+#include <windows.h>
+#else
+#include <boost/context/detail/fcontext.hpp>
+#endif
+
+namespace Common {
+
+constexpr std::size_t default_stack_size = 256 * 1024; // 256kb
+
+#if defined(_WIN32) || defined(WIN32)
+
+struct Fiber::FiberImpl {
+    LPVOID handle = nullptr;
+    LPVOID rewind_handle = nullptr;
+};
+
+void Fiber::Start() {
+    ASSERT(previous_fiber != nullptr);
+    previous_fiber->guard.unlock();
+    previous_fiber.reset();
+    entry_point(start_parameter);
+    UNREACHABLE();
+}
+
+void Fiber::OnRewind() {
+    ASSERT(impl->handle != nullptr);
+    DeleteFiber(impl->handle);
+    impl->handle = impl->rewind_handle;
+    impl->rewind_handle = nullptr;
+    rewind_point(rewind_parameter);
+    UNREACHABLE();
+}
+
+void Fiber::FiberStartFunc(void* fiber_parameter) {
+    auto fiber = static_cast<Fiber*>(fiber_parameter);
+    fiber->Start();
+}
+
+void Fiber::RewindStartFunc(void* fiber_parameter) {
+    auto fiber = static_cast<Fiber*>(fiber_parameter);
+    fiber->OnRewind();
+}
+
+Fiber::Fiber(std::function<void(void*)>&& entry_point_func, void* start_parameter)
+    : entry_point{std::move(entry_point_func)}, start_parameter{start_parameter} {
+    impl = std::make_unique<FiberImpl>();
+    impl->handle = CreateFiber(default_stack_size, &FiberStartFunc, this);
+}
+
+Fiber::Fiber() : impl{std::make_unique<FiberImpl>()} {}
+
+Fiber::~Fiber() {
+    if (released) {
+        return;
+    }
+    // Make sure the Fiber is not being used
+    const bool locked = guard.try_lock();
+    ASSERT_MSG(locked, "Destroying a fiber that's still running");
+    if (locked) {
+        guard.unlock();
+    }
+    DeleteFiber(impl->handle);
+}
+
+void Fiber::Exit() {
+    ASSERT_MSG(is_thread_fiber, "Exitting non main thread fiber");
+    if (!is_thread_fiber) {
+        return;
+    }
+    ConvertFiberToThread();
+    guard.unlock();
+    released = true;
+}
+
+void Fiber::SetRewindPoint(std::function<void(void*)>&& rewind_func, void* start_parameter) {
+    rewind_point = std::move(rewind_func);
+    rewind_parameter = start_parameter;
+}
+
+void Fiber::Rewind() {
+    ASSERT(rewind_point);
+    ASSERT(impl->rewind_handle == nullptr);
+    impl->rewind_handle = CreateFiber(default_stack_size, &RewindStartFunc, this);
+    SwitchToFiber(impl->rewind_handle);
+}
+
+void Fiber::YieldTo(std::shared_ptr<Fiber>& from, std::shared_ptr<Fiber>& to) {
+    ASSERT_MSG(from != nullptr, "Yielding fiber is null!");
+    ASSERT_MSG(to != nullptr, "Next fiber is null!");
+    to->guard.lock();
+    to->previous_fiber = from;
+    SwitchToFiber(to->impl->handle);
+    ASSERT(from->previous_fiber != nullptr);
+    from->previous_fiber->guard.unlock();
+    from->previous_fiber.reset();
+}
+
+std::shared_ptr<Fiber> Fiber::ThreadToFiber() {
+    std::shared_ptr<Fiber> fiber = std::shared_ptr<Fiber>{new Fiber()};
+    fiber->guard.lock();
+    fiber->impl->handle = ConvertThreadToFiber(nullptr);
+    fiber->is_thread_fiber = true;
+    return fiber;
+}
+
+#else
+
+struct Fiber::FiberImpl {
+    alignas(64) std::array<u8, default_stack_size> stack;
+    alignas(64) std::array<u8, default_stack_size> rewind_stack;
+    u8* stack_limit;
+    u8* rewind_stack_limit;
+    boost::context::detail::fcontext_t context;
+    boost::context::detail::fcontext_t rewind_context;
+};
+
+void Fiber::Start(boost::context::detail::transfer_t& transfer) {
+    ASSERT(previous_fiber != nullptr);
+    previous_fiber->impl->context = transfer.fctx;
+    previous_fiber->guard.unlock();
+    previous_fiber.reset();
+    entry_point(start_parameter);
+    UNREACHABLE();
+}
+
+void Fiber::OnRewind([[maybe_unused]] boost::context::detail::transfer_t& transfer) {
+    ASSERT(impl->context != nullptr);
+    impl->context = impl->rewind_context;
+    impl->rewind_context = nullptr;
+    u8* tmp = impl->stack_limit;
+    impl->stack_limit = impl->rewind_stack_limit;
+    impl->rewind_stack_limit = tmp;
+    rewind_point(rewind_parameter);
+    UNREACHABLE();
+}
+
+void Fiber::FiberStartFunc(boost::context::detail::transfer_t transfer) {
+    auto fiber = static_cast<Fiber*>(transfer.data);
+    fiber->Start(transfer);
+}
+
+void Fiber::RewindStartFunc(boost::context::detail::transfer_t transfer) {
+    auto fiber = static_cast<Fiber*>(transfer.data);
+    fiber->OnRewind(transfer);
+}
+
+Fiber::Fiber(std::function<void(void*)>&& entry_point_func, void* start_parameter)
+    : entry_point{std::move(entry_point_func)}, start_parameter{start_parameter} {
+    impl = std::make_unique<FiberImpl>();
+    impl->stack_limit = impl->stack.data();
+    impl->rewind_stack_limit = impl->rewind_stack.data();
+    u8* stack_base = impl->stack_limit + default_stack_size;
+    impl->context =
+        boost::context::detail::make_fcontext(stack_base, impl->stack.size(), FiberStartFunc);
+}
+
+void Fiber::SetRewindPoint(std::function<void(void*)>&& rewind_func, void* start_parameter) {
+    rewind_point = std::move(rewind_func);
+    rewind_parameter = start_parameter;
+}
+
+Fiber::Fiber() : impl{std::make_unique<FiberImpl>()} {}
+
+Fiber::~Fiber() {
+    if (released) {
+        return;
+    }
+    // Make sure the Fiber is not being used
+    const bool locked = guard.try_lock();
+    ASSERT_MSG(locked, "Destroying a fiber that's still running");
+    if (locked) {
+        guard.unlock();
+    }
+}
+
+void Fiber::Exit() {
+
+    ASSERT_MSG(is_thread_fiber, "Exitting non main thread fiber");
+    if (!is_thread_fiber) {
+        return;
+    }
+    guard.unlock();
+    released = true;
+}
+
+void Fiber::Rewind() {
+    ASSERT(rewind_point);
+    ASSERT(impl->rewind_context == nullptr);
+    // The spare stack is whichever buffer rewind_stack_limit currently points at (OnRewind swaps
+    // the two limits), so size it with the shared constant rather than a specific array.
+    u8* const spare_stack_top = impl->rewind_stack_limit + default_stack_size;
+    impl->rewind_context = boost::context::detail::make_fcontext(
+        spare_stack_top, default_stack_size, RewindStartFunc);
+    // Enter the fresh context; RewindStartFunc receives this fiber through transfer.data.
+    boost::context::detail::jump_fcontext(impl->rewind_context, this);
+}
+
+// Switches execution from fiber 'from' (the caller) to fiber 'to'.
+// The call returns only once some other fiber later yields back to 'from'.
+void Fiber::YieldTo(std::shared_ptr<Fiber>& from, std::shared_ptr<Fiber>& to) {
+    ASSERT_MSG(from != nullptr, "Yielding fiber is null!");
+    ASSERT_MSG(to != nullptr, "Next fiber is null!");
+    // Spin until 'to' is no longer running anywhere, then claim it.
+    to->guard.lock();
+    // Tell 'to' who switched into it so it can save our context and release our guard.
+    to->previous_fiber = from;
+    auto transfer = boost::context::detail::jump_fcontext(to->impl->context, to.get());
+    // Execution resumes here after another fiber yielded to 'from'; that fiber recorded itself
+    // in from->previous_fiber before jumping.
+    ASSERT(from->previous_fiber != nullptr);
+    // transfer.fctx is the suspended context of the fiber that jumped here; store it so that
+    // fiber can be resumed later.
+    from->previous_fiber->impl->context = transfer.fctx;
+    // The previous fiber is now fully suspended: release it for other threads.
+    from->previous_fiber->guard.unlock();
+    from->previous_fiber.reset();
+}
+
+std::shared_ptr<Fiber> Fiber::ThreadToFiber() {
+    std::shared_ptr<Fiber> fiber = std::shared_ptr<Fiber>{new Fiber()};
+    fiber->guard.lock();
+    fiber->is_thread_fiber = true;
+    return fiber;
+}
+
+#endif
+} // namespace Common
--- a/src/common/fiber.h
+++ b/src/common/fiber.h
@@ -0,0 +1,92 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <functional>
+#include <memory>
+
+#include "common/common_types.h"
+#include "common/spin_lock.h"
+
+#if !defined(_WIN32) && !defined(WIN32)
+namespace boost::context::detail {
+struct transfer_t;
+}
+#endif
+
+namespace Common {
+
+/**
+ * Fiber class
+ * A fiber is a userspace thread with its own context. Fibers can be used to
+ * implement coroutines, emulated threading systems and certain asynchronous
+ * patterns.
+ *
+ * This class implements fibers at a low level, thus allowing greater freedom
+ * to implement such patterns. The class is 'threadsafe' in the sense that only one
+ * thread can run a given fiber at a time: a thread that tries to yield to a running
+ * fiber spins until that fiber yields.
+ *
+ * WARNING: exchanging two running fibers between threads will cause a deadlock.
+ * In order to prevent a deadlock, each thread should have an intermediary fiber:
+ * switch to the intermediary fiber of the current thread and then, from it, switch
+ * to the expected fiber. This way you can exchange 2 fibers within 2 different threads.
+ */
+class Fiber {
+public:
+    /// Creates a fiber that runs entry_point_func(start_parameter) when first yielded to.
+    Fiber(std::function<void(void*)>&& entry_point_func, void* start_parameter);
+    ~Fiber();
+
+    Fiber(const Fiber&) = delete;
+    Fiber& operator=(const Fiber&) = delete;
+
+    // A Fiber is pinned to its address: the implementation hands `this` to the underlying
+    // context, and the SpinLock member cannot be moved. The previous `= default` declarations
+    // were therefore implicitly deleted; spell that out explicitly.
+    Fiber(Fiber&&) = delete;
+    Fiber& operator=(Fiber&&) = delete;
+
+    /// Yields control from Fiber 'from' to Fiber 'to'
+    /// Fiber 'from' must be the currently running fiber.
+    static void YieldTo(std::shared_ptr<Fiber>& from, std::shared_ptr<Fiber>& to);
+
+    /// Creates a Fiber object representing the calling thread, with its guard already held.
+    static std::shared_ptr<Fiber> ThreadToFiber();
+
+    /// Registers the function (and its argument) that Rewind() will run.
+    void SetRewindPoint(std::function<void(void*)>&& rewind_func, void* start_parameter);
+
+    /// Abandons the fiber's current execution and runs the rewind function on a fresh context.
+    /// A rewind point must have been set beforehand.
+    void Rewind();
+
+    /// Only call from main thread's fiber
+    void Exit();
+
+    /// Changes the start parameter of the fiber. Has no effect if the fiber already started
+    void SetStartParameter(void* new_parameter) {
+        start_parameter = new_parameter;
+    }
+
+private:
+    /// Used by ThreadToFiber(): builds a fiber without an entry point.
+    Fiber();
+
+#if defined(_WIN32) || defined(WIN32)
+    void OnRewind();
+    void Start();
+    static void FiberStartFunc(void* fiber_parameter);
+    static void RewindStartFunc(void* fiber_parameter);
+#else
+    void OnRewind(boost::context::detail::transfer_t& transfer);
+    void Start(boost::context::detail::transfer_t& transfer);
+    static void FiberStartFunc(boost::context::detail::transfer_t transfer);
+    static void RewindStartFunc(boost::context::detail::transfer_t transfer);
+#endif
+
+    /// Platform-specific state (native fiber handles or boost contexts and stacks).
+    struct FiberImpl;
+
+    SpinLock guard{}; ///< Held while the fiber is running.
+    std::function<void(void*)> entry_point;
+    std::function<void(void*)> rewind_point;
+    void* rewind_parameter{};
+    void* start_parameter{};
+    std::shared_ptr<Fiber> previous_fiber; ///< Fiber that yielded to this one; cleared on resume.
+    std::unique_ptr<FiberImpl> impl;
+    bool is_thread_fiber{}; ///< Set for fibers created through ThreadToFiber().
+    bool released{};        ///< Set by Exit(); the destructor then skips its cleanup.
+};
+
+} // namespace Common
--- a/src/common/memory_detect.cpp
+++ b/src/common/memory_detect.cpp
@@ -9,10 +9,12 @@
 // clang-format on
 #else
 #include <sys/types.h>
-#ifdef __APPLE__
+#if defined(__APPLE__) || defined(__FreeBSD__)
 #include <sys/sysctl.h>
-#else
+#elif defined(__linux__)
 #include <sys/sysinfo.h>
+#else
+#include <unistd.h>
 #endif
 #endif

@@ -38,15 +40,26 @@ static MemoryInfo Detect() {
    // hw and vm are defined in sysctl.h
    // https://github.com/apple/darwin-xnu/blob/master/bsd/sys/sysctl.h#L471
    // sysctlbyname(const char *, void *, size_t *, void *, size_t);
-    sysctlbyname("hw.memsize", &ramsize, &sizeof_ramsize, NULL, 0);
-    sysctlbyname("vm.swapusage", &vmusage, &sizeof_vmusage, NULL, 0);
+    sysctlbyname("hw.memsize", &ramsize, &sizeof_ramsize, nullptr, 0);
+    sysctlbyname("vm.swapusage", &vmusage, &sizeof_vmusage, nullptr, 0);
    mem_info.TotalPhysicalMemory = ramsize;
    mem_info.TotalSwapMemory = vmusage.xsu_total;
-#else
+#elif defined(__FreeBSD__)
+    u_long physmem, swap_total;
+    std::size_t sizeof_u_long = sizeof(u_long);
+    // sysctlbyname(const char *, void *, size_t *, const void *, size_t);
+    sysctlbyname("hw.physmem", &physmem, &sizeof_u_long, nullptr, 0);
+    sysctlbyname("vm.swap_total", &swap_total, &sizeof_u_long, nullptr, 0);
+    mem_info.TotalPhysicalMemory = physmem;
+    mem_info.TotalSwapMemory = swap_total;
+#elif defined(__linux__)
    struct sysinfo meminfo;
    sysinfo(&meminfo);
    mem_info.TotalPhysicalMemory = meminfo.totalram;
    mem_info.TotalSwapMemory = meminfo.totalswap;
+#else
+    mem_info.TotalPhysicalMemory = sysconf(_SC_PHYS_PAGES) * sysconf(_SC_PAGE_SIZE);
+    mem_info.TotalSwapMemory = 0;
 #endif

    return mem_info;
--- a/src/common/spin_lock.cpp
+++ b/src/common/spin_lock.cpp
@@ -0,0 +1,54 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include "common/spin_lock.h"
+
+#if _MSC_VER
+#include <intrin.h>
+#if _M_AMD64
+#define __x86_64__ 1
+#endif
+#if _M_ARM64
+#define __aarch64__ 1
+#endif
+#else
+#if __x86_64__
+#include <xmmintrin.h>
+#endif
+#endif
+
+namespace {
+
+void ThreadPause() {
+#if __x86_64__
+    _mm_pause();
+#elif __aarch64__ && _MSC_VER
+    __yield();
+#elif __aarch64__
+    asm("yield");
+#endif
+}
+
+} // Anonymous namespace
+
+namespace Common {
+
+void SpinLock::lock() {
+    while (lck.test_and_set(std::memory_order_acquire)) {
+        ThreadPause();
+    }
+}
+
+void SpinLock::unlock() {
+    lck.clear(std::memory_order_release);
+}
+
+bool SpinLock::try_lock() {
+    // test_and_set reports the previous state of the flag: false means it was clear and this
+    // call has just taken the lock.
+    const bool was_held = lck.test_and_set(std::memory_order_acquire);
+    return !was_held;
+}
+
+} // namespace Common
--- a/src/common/spin_lock.h
+++ b/src/common/spin_lock.h
@@ -0,0 +1,26 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <atomic>
+
+namespace Common {
+
+/**
+ * SpinLock class
+ * A lock similar to a mutex that makes a thread spin-wait instead of calling into the
+ * supervisor. Should only be used around short sequences of code.
+ */
+class SpinLock {
+public:
+    /// Spins until the lock has been acquired.
+    void lock();
+    /// Releases the lock.
+    void unlock();
+    /// Makes a single attempt to acquire the lock; returns true on success.
+    bool try_lock();
+
+private:
+    std::atomic_flag lck = ATOMIC_FLAG_INIT;
+};
+
+} // namespace Common
--- a/src/common/telemetry.cpp
+++ b/src/common/telemetry.cpp
@@ -60,6 +60,7 @@ void AppendCPUInfo(FieldCollection& fc) {
    fc.AddField(FieldType::UserSystem, "CPU_Extension_x64_AES", Common::GetCPUCaps().aes);
    fc.AddField(FieldType::UserSystem, "CPU_Extension_x64_AVX", Common::GetCPUCaps().avx);
    fc.AddField(FieldType::UserSystem, "CPU_Extension_x64_AVX2", Common::GetCPUCaps().avx2);
+    fc.AddField(FieldType::UserSystem, "CPU_Extension_x64_AVX512", Common::GetCPUCaps().avx512);
    fc.AddField(FieldType::UserSystem, "CPU_Extension_x64_BMI1", Common::GetCPUCaps().bmi1);
    fc.AddField(FieldType::UserSystem, "CPU_Extension_x64_BMI2", Common::GetCPUCaps().bmi2);
    fc.AddField(FieldType::UserSystem, "CPU_Extension_x64_FMA", Common::GetCPUCaps().fma);
--- a/src/common/thread.cpp
+++ b/src/common/thread.cpp
@@ -25,6 +25,52 @@

 namespace Common {

+#ifdef _WIN32
+
+void SetCurrentThreadPriority(ThreadPriority new_priority) {
+    auto handle = GetCurrentThread();
+    int windows_priority = 0;
+    switch (new_priority) {
+    case ThreadPriority::Low:
+        windows_priority = THREAD_PRIORITY_BELOW_NORMAL;
+        break;
+    case ThreadPriority::Normal:
+        windows_priority = THREAD_PRIORITY_NORMAL;
+        break;
+    case ThreadPriority::High:
+        windows_priority = THREAD_PRIORITY_ABOVE_NORMAL;
+        break;
+    case ThreadPriority::VeryHigh:
+        windows_priority = THREAD_PRIORITY_HIGHEST;
+        break;
+    default:
+        windows_priority = THREAD_PRIORITY_NORMAL;
+        break;
+    }
+    SetThreadPriority(handle, windows_priority);
+}
+
+#else
+
+// Maps the requested priority onto the SCHED_OTHER priority range of the host and applies it
+// to the calling thread. Failures of pthread_setschedparam are ignored (best effort).
+void SetCurrentThreadPriority(ThreadPriority new_priority) {
+    const pthread_t this_thread = pthread_self();
+
+    const s32 max_prio = sched_get_priority_max(SCHED_OTHER);
+    const s32 min_prio = sched_get_priority_min(SCHED_OTHER);
+    // Low..VeryHigh become 1..4, i.e. the number of quarters of the range to move from min_prio.
+    const s32 level = static_cast<s32>(new_priority) + 1;
+
+    // Zero-initialise: sched_param may contain platform-specific members besides
+    // sched_priority, and they must not be passed to the kernel uninitialised.
+    struct sched_param params {};
+    // Pure signed arithmetic covers both orientations of the range: when max_prio < min_prio
+    // the offset is negative and, because integer division truncates towards zero, yields the
+    // same value as stepping down from min_prio by the scaled magnitude.
+    params.sched_priority = min_prio + ((max_prio - min_prio) * level) / 4;
+
+    pthread_setschedparam(this_thread, SCHED_OTHER, &params);
+}
+
+#endif
+
 #ifdef _MSC_VER

 // Sets the debugger-visible name of the current thread.
@@ -70,6 +116,12 @@ void SetCurrentThreadName(const char* name) {
 }
 #endif

+#if defined(_WIN32)
+void SetCurrentThreadName(const char* name) {
+    // Do Nothing on MingW
+}
+#endif
+
 #endif

 } // namespace Common
--- a/src/common/thread.h
+++ b/src/common/thread.h
@@ -9,6 +9,7 @@
 #include <cstddef>
 #include <mutex>
 #include <thread>
+#include "common/common_types.h"

 namespace Common {

@@ -28,8 +29,7 @@ public:
        is_set = false;
    }

-    template <class Duration>
-    bool WaitFor(const std::chrono::duration<Duration>& time) {
+    bool WaitFor(const std::chrono::nanoseconds& time) {
        std::unique_lock lk{mutex};
        if (!condvar.wait_for(lk, time, [this] { return is_set; }))
            return false;
@@ -86,6 +86,15 @@ private:
    std::size_t generation = 0; // Incremented once each time the barrier is used
 };

+enum class ThreadPriority : u32 {
+    Low = 0,
+    Normal = 1,
+    High = 2,
+    VeryHigh = 3,
+};
+
+void SetCurrentThreadPriority(ThreadPriority new_priority);
+
 void SetCurrentThreadName(const char* name);

 } // namespace Common
--- a/src/common/uint128.cpp
+++ b/src/common/uint128.cpp
@@ -6,12 +6,38 @@
 #include <intrin.h>

 #pragma intrinsic(_umul128)
+#pragma intrinsic(_udiv128)
 #endif
 #include <cstring>
 #include "common/uint128.h"

 namespace Common {

+#ifdef _MSC_VER
+
+u64 MultiplyAndDivide64(u64 a, u64 b, u64 d) {
+    u128 r{};
+    r[0] = _umul128(a, b, &r[1]);
+    u64 remainder;
+#if _MSC_VER < 1923
+    return udiv128(r[1], r[0], d, &remainder);
+#else
+    return _udiv128(r[1], r[0], d, &remainder);
+#endif
+}
+
+#else
+
+u64 MultiplyAndDivide64(u64 a, u64 b, u64 d) {
+    const u64 diva = a / d;
+    const u64 moda = a % d;
+    const u64 divb = b / d;
+    const u64 modb = b % d;
+    return diva * b + moda * divb + moda * modb / d;
+}
+
+#endif
+
 u128 Multiply64Into128(u64 a, u64 b) {
    u128 result;
 #ifdef _MSC_VER
--- a/src/common/uint128.h
+++ b/src/common/uint128.h
@@ -9,6 +9,9 @@

 namespace Common {

+// This function multiplies 2 u64 values and divides it by a u64 value.
+u64 MultiplyAndDivide64(u64 a, u64 b, u64 d);
+
 // This function multiplies 2 u64 values and produces a u128 value;
 u128 Multiply64Into128(u64 a, u64 b);

--- a/src/common/wall_clock.cpp
+++ b/src/common/wall_clock.cpp
@@ -0,0 +1,91 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include "common/uint128.h"
+#include "common/wall_clock.h"
+
+#ifdef ARCHITECTURE_x86_64
+#include "common/x64/cpu_detect.h"
+#include "common/x64/native_clock.h"
+#endif
+
+namespace Common {
+
+using base_timer = std::chrono::steady_clock;
+using base_time_point = std::chrono::time_point<base_timer>;
+
+/// Wall clock backed by std::chrono::steady_clock; time is measured from construction.
+class StandardWallClock : public WallClock {
+public:
+    StandardWallClock(u64 emulated_cpu_frequency, u64 emulated_clock_frequency)
+        : WallClock(emulated_cpu_frequency, emulated_clock_frequency, false),
+          start_time{base_timer::now()} {}
+
+    std::chrono::nanoseconds GetTimeNS() override {
+        return ElapsedAs<std::chrono::nanoseconds>();
+    }
+
+    std::chrono::microseconds GetTimeUS() override {
+        return ElapsedAs<std::chrono::microseconds>();
+    }
+
+    std::chrono::milliseconds GetTimeMS() override {
+        return ElapsedAs<std::chrono::milliseconds>();
+    }
+
+    u64 GetClockCycles() override {
+        return ElapsedTicksAt(emulated_clock_frequency);
+    }
+
+    u64 GetCPUCycles() override {
+        return ElapsedTicksAt(emulated_cpu_frequency);
+    }
+
+    void Pause(bool is_paused) override {
+        // Do nothing in this clock type.
+    }
+
+private:
+    /// Time elapsed since construction, converted to the requested duration type.
+    template <typename Duration>
+    Duration ElapsedAs() const {
+        return std::chrono::duration_cast<Duration>(base_timer::now() - start_time);
+    }
+
+    /// Converts the elapsed nanoseconds into ticks of a clock running at `frequency` Hz,
+    /// going through a 128-bit product before dividing by one second in nanoseconds.
+    u64 ElapsedTicksAt(u64 frequency) {
+        const std::chrono::nanoseconds time_now = GetTimeNS();
+        const u128 product = Common::Multiply64Into128(time_now.count(), frequency);
+        return Common::Divide128On32(product, 1000000000).first;
+    }
+
+    base_time_point start_time;
+};
+
+#ifdef ARCHITECTURE_x86_64
+
+// Picks the TSC-based native clock when the host reports an invariant TSC and its frequency
+// could be estimated; otherwise falls back to the std::chrono based clock.
+std::unique_ptr<WallClock> CreateBestMatchingClock(u32 emulated_cpu_frequency,
+                                                   u32 emulated_clock_frequency) {
+    // The frequency estimation is only attempted when the TSC is reported as invariant.
+    const u64 tsc_frequency = GetCPUCaps().invariant_tsc ? EstimateRDTSCFrequency() : 0;
+    if (tsc_frequency != 0) {
+        return std::make_unique<X64::NativeClock>(emulated_cpu_frequency, emulated_clock_frequency,
+                                                  tsc_frequency);
+    }
+    return std::make_unique<StandardWallClock>(emulated_cpu_frequency, emulated_clock_frequency);
+}
+
+#else
+
+std::unique_ptr<WallClock> CreateBestMatchingClock(u32 emulated_cpu_frequency,
+                                                   u32 emulated_clock_frequency) {
+    return std::make_unique<StandardWallClock>(emulated_cpu_frequency, emulated_clock_frequency);
+}
+
+#endif
+
+} // namespace Common
--- a/src/common/wall_clock.h
+++ b/src/common/wall_clock.h
@@ -0,0 +1,53 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <chrono>
+#include <memory>
+
+#include "common/common_types.h"
+
+namespace Common {
+
+/// Abstract source of wall time, also expressed in emulated clock and CPU cycles.
+/// Instances are obtained through CreateBestMatchingClock().
+class WallClock {
+public:
+    /// Clocks are owned and destroyed through std::unique_ptr<WallClock>, so the destructor
+    /// must be virtual for the derived object to be destroyed correctly.
+    virtual ~WallClock() = default;
+
+    /// Returns current wall time in nanoseconds
+    virtual std::chrono::nanoseconds GetTimeNS() = 0;
+
+    /// Returns current wall time in microseconds
+    virtual std::chrono::microseconds GetTimeUS() = 0;
+
+    /// Returns current wall time in milliseconds
+    virtual std::chrono::milliseconds GetTimeMS() = 0;
+
+    /// Returns current wall time in emulated clock cycles
+    virtual u64 GetClockCycles() = 0;
+
+    /// Returns current wall time in emulated cpu cycles
+    virtual u64 GetCPUCycles() = 0;
+
+    /// Notifies the clock that emulation is being paused (true) or resumed (false).
+    virtual void Pause(bool is_paused) = 0;
+
+    /// Tells if the wall clock uses the host CPU's hardware clock
+    bool IsNative() const {
+        return is_native;
+    }
+
+protected:
+    WallClock(u64 emulated_cpu_frequency, u64 emulated_clock_frequency, bool is_native)
+        : emulated_cpu_frequency{emulated_cpu_frequency},
+          emulated_clock_frequency{emulated_clock_frequency}, is_native{is_native} {}
+
+    u64 emulated_cpu_frequency;   ///< Frequency used to scale time into emulated CPU cycles.
+    u64 emulated_clock_frequency; ///< Frequency used to scale time into emulated clock cycles.
+
+private:
+    bool is_native;
+};
+
+std::unique_ptr<WallClock> CreateBestMatchingClock(u32 emulated_cpu_frequency,
+                                                   u32 emulated_clock_frequency);
+
+} // namespace Common
--- a/src/common/x64/cpu_detect.cpp
+++ b/src/common/x64/cpu_detect.cpp
@@ -62,6 +62,17 @@ static CPUCaps Detect() {
    std::memcpy(&caps.brand_string[0], &cpu_id[1], sizeof(int));
    std::memcpy(&caps.brand_string[4], &cpu_id[3], sizeof(int));
    std::memcpy(&caps.brand_string[8], &cpu_id[2], sizeof(int));
+    if (cpu_id[1] == 0x756e6547 && cpu_id[2] == 0x6c65746e && cpu_id[3] == 0x49656e69)
+        caps.manufacturer = Manufacturer::Intel;
+    else if (cpu_id[1] == 0x68747541 && cpu_id[2] == 0x444d4163 && cpu_id[3] == 0x69746e65)
+        caps.manufacturer = Manufacturer::AMD;
+    else if (cpu_id[1] == 0x6f677948 && cpu_id[2] == 0x656e6975 && cpu_id[3] == 0x6e65476e)
+        caps.manufacturer = Manufacturer::Hygon;
+    else
+        caps.manufacturer = Manufacturer::Unknown;
+
+    u32 family = {};
+    u32 model = {};

    __cpuid(cpu_id, 0x80000000);

@@ -73,6 +84,14 @@ static CPUCaps Detect() {
    // Detect family and other miscellaneous features
    if (max_std_fn >= 1) {
        __cpuid(cpu_id, 0x00000001);
+        family = (cpu_id[0] >> 8) & 0xf;
+        model = (cpu_id[0] >> 4) & 0xf;
+        if (family == 0xf) {
+            family += (cpu_id[0] >> 20) & 0xff;
+        }
+        if (family >= 6) {
+            model += ((cpu_id[0] >> 16) & 0xf) << 4;
+        }

        if ((cpu_id[3] >> 25) & 1)
            caps.sse = true;
@@ -110,6 +129,11 @@ static CPUCaps Detect() {
                caps.bmi1 = true;
            if ((cpu_id[1] >> 8) & 1)
                caps.bmi2 = true;
+            // Checks for AVX512F, AVX512CD, AVX512VL, AVX512DQ, AVX512BW (Intel Skylake-X/SP)
+            if ((cpu_id[1] >> 16) & 1 && (cpu_id[1] >> 28) & 1 && (cpu_id[1] >> 31) & 1 &&
+                (cpu_id[1] >> 17) & 1 && (cpu_id[1] >> 30) & 1) {
+                caps.avx512 = caps.avx2;
+            }
        }
    }

@@ -130,6 +154,20 @@ static CPUCaps Detect() {
            caps.fma4 = true;
    }

+    if (max_ex_fn >= 0x80000007) {
+        __cpuid(cpu_id, 0x80000007);
+        if (cpu_id[3] & (1 << 8)) {
+            caps.invariant_tsc = true;
+        }
+    }
+
+    if (max_std_fn >= 0x16) {
+        __cpuid(cpu_id, 0x16);
+        caps.base_frequency = cpu_id[0];
+        caps.max_frequency = cpu_id[1];
+        caps.bus_frequency = cpu_id[2];
+    }
+
    return caps;
 }

--- a/src/common/x64/cpu_detect.h
+++ b/src/common/x64/cpu_detect.h
@@ -6,8 +6,16 @@

 namespace Common {

+enum class Manufacturer : u32 {
+    Intel = 0,
+    AMD = 1,
+    Hygon = 2,
+    Unknown = 3,
+};
+
 /// x86/x64 CPU capabilities that may be detected by this module
 struct CPUCaps {
+    Manufacturer manufacturer;
    char cpu_string[0x21];
    char brand_string[0x41];
    bool sse;
@@ -19,11 +27,16 @@ struct CPUCaps {
    bool lzcnt;
    bool avx;
    bool avx2;
+    bool avx512;
    bool bmi1;
    bool bmi2;
    bool fma;
    bool fma4;
    bool aes;
+    bool invariant_tsc;
+    u32 base_frequency;
+    u32 max_frequency;
+    u32 bus_frequency;
 };

 /**
--- a/src/common/x64/native_clock.cpp
+++ b/src/common/x64/native_clock.cpp
@@ -0,0 +1,103 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include <chrono>
+#include <mutex>
+#include <thread>
+
+#ifdef _MSC_VER
+#include <intrin.h>
+#else
+#include <x86intrin.h>
+#endif
+
+#include "common/uint128.h"
+#include "common/x64/native_clock.h"
+
+namespace Common {
+
+// Estimates the host TSC frequency in Hz by counting TSC ticks over a window of roughly three
+// seconds. This call blocks the calling thread for the whole window.
+u64 EstimateRDTSCFrequency() {
+    // Calibrate against a monotonic clock: high_resolution_clock may be an alias of the
+    // adjustable system clock, and a time adjustment during the window would skew the result.
+    using Clock = std::chrono::steady_clock;
+    constexpr auto poll_interval = std::chrono::milliseconds{10};
+    constexpr auto measure_window = std::chrono::milliseconds{3000};
+
+    // get current time
+    _mm_mfence();
+    const u64 tsc_start = __rdtsc();
+    const auto start_time = Clock::now();
+    // wait roughly 3 seconds
+    while (Clock::now() - start_time < measure_window) {
+        std::this_thread::sleep_for(poll_interval);
+    }
+    const auto end_time = Clock::now();
+    _mm_mfence();
+    const u64 tsc_end = __rdtsc();
+    // calculate difference
+    const u64 timer_diff =
+        std::chrono::duration_cast<std::chrono::nanoseconds>(end_time - start_time).count();
+    const u64 tsc_diff = tsc_end - tsc_start;
+    // ticks * (nanoseconds per second) / elapsed nanoseconds = ticks per second
+    return MultiplyAndDivide64(tsc_diff, 1000000000ULL, timer_diff);
+}
+
+namespace X64 {
+NativeClock::NativeClock(u64 emulated_cpu_frequency, u64 emulated_clock_frequency,
+                         u64 rtsc_frequency)
+    : WallClock(emulated_cpu_frequency, emulated_clock_frequency, true), rtsc_frequency{
+                                                                             rtsc_frequency} {
+    _mm_mfence();
+    last_measure = __rdtsc();
+    accumulated_ticks = 0U;
+}
+
+// Returns the number of TSC ticks accumulated by this clock, with the low bits masked off.
+// Serialized by rtsc_serialize because it updates last_measure and accumulated_ticks.
+u64 NativeClock::GetRTSC() {
+    std::scoped_lock scope{rtsc_serialize};
+    _mm_mfence();
+    const u64 current_measure = __rdtsc();
+    u64 diff = current_measure - last_measure;
+    // Branchless clamp: the arithmetic shift yields all ones when diff, read as signed, is
+    // negative, so the mask zeroes it; otherwise diff is kept unchanged.
+    diff = diff & ~static_cast<u64>(static_cast<s64>(diff) >> 63); // max(diff, 0)
+    // Never move the reference point backwards.
+    if (current_measure > last_measure) {
+        last_measure = current_measure;
+    }
+    accumulated_ticks += diff;
+    /// The clock cannot be more precise than the guest timer, remove the lower bits
+    return accumulated_ticks & inaccuracy_mask;
+}
+
+// Called when emulation is paused (true) or resumed (false).
+void NativeClock::Pause(bool is_paused) {
+    if (!is_paused) {
+        // last_measure is also read and written by GetRTSC() under rtsc_serialize; take the
+        // same lock so a concurrent reader never races with this update.
+        std::scoped_lock scope{rtsc_serialize};
+        // On resume, restart the measurement from the current TSC value so the ticks that
+        // elapsed while paused are not added to accumulated_ticks.
+        _mm_mfence();
+        last_measure = __rdtsc();
+    }
+}
+
+std::chrono::nanoseconds NativeClock::GetTimeNS() {
+    const u64 rtsc_value = GetRTSC();
+    return std::chrono::nanoseconds{MultiplyAndDivide64(rtsc_value, 1000000000, rtsc_frequency)};
+}
+
+std::chrono::microseconds NativeClock::GetTimeUS() {
+    const u64 rtsc_value = GetRTSC();
+    return std::chrono::microseconds{MultiplyAndDivide64(rtsc_value, 1000000, rtsc_frequency)};
+}
+
+std::chrono::milliseconds NativeClock::GetTimeMS() {
+    const u64 rtsc_value = GetRTSC();
+    return std::chrono::milliseconds{MultiplyAndDivide64(rtsc_value, 1000, rtsc_frequency)};
+}
+
+u64 NativeClock::GetClockCycles() {
+    const u64 rtsc_value = GetRTSC();
+    return MultiplyAndDivide64(rtsc_value, emulated_clock_frequency, rtsc_frequency);
+}
+
+u64 NativeClock::GetCPUCycles() {
+    const u64 rtsc_value = GetRTSC();
+    return MultiplyAndDivide64(rtsc_value, emulated_cpu_frequency, rtsc_frequency);
+}
+
+} // namespace X64
+
+} // namespace Common
--- a/src/common/x64/native_clock.h
+++ b/src/common/x64/native_clock.h
@@ -0,0 +1,48 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <optional>
+
+#include "common/spin_lock.h"
+#include "common/wall_clock.h"
+
+namespace Common {
+
+namespace X64 {
+class NativeClock : public WallClock {
+public:
+    NativeClock(u64 emulated_cpu_frequency, u64 emulated_clock_frequency, u64 rtsc_frequency);
+
+    std::chrono::nanoseconds GetTimeNS() override;
+
+    std::chrono::microseconds GetTimeUS() override;
+
+    std::chrono::milliseconds GetTimeMS() override;
+
+    u64 GetClockCycles() override;
+
+    u64 GetCPUCycles() override;
+
+    void Pause(bool is_paused) override;
+
+private:
+    u64 GetRTSC();
+
+    /// Value used to reduce the native clock's accuracy, as some apps rely on
+    /// undefined behavior where the level of accuracy in the clock shouldn't
+    /// be higher.
+    static constexpr u64 inaccuracy_mask = ~(0x400 - 1);
+
+    SpinLock rtsc_serialize{};
+    u64 last_measure{};
+    u64 accumulated_ticks{};
+    u64 rtsc_frequency;
+};
+} // namespace X64
+
+u64 EstimateRDTSCFrequency();
+
+} // namespace Common
--- a/src/common/x64/xbyak_abi.h
+++ b/src/common/x64/xbyak_abi.h
@@ -11,7 +11,7 @@

 namespace Common::X64 {

-inline int RegToIndex(const Xbyak::Reg& reg) {
+inline std::size_t RegToIndex(const Xbyak::Reg& reg) {
    using Kind = Xbyak::Reg::Kind;
    ASSERT_MSG((reg.getKind() & (Kind::REG | Kind::XMM)) != 0,
               "RegSet only support GPRs and XMM registers.");
@@ -19,17 +19,17 @@ inline int RegToIndex(const Xbyak::Reg& reg) {
    return reg.getIdx() + (reg.getKind() == Kind::REG ? 0 : 16);
 }

-inline Xbyak::Reg64 IndexToReg64(int reg_index) {
+inline Xbyak::Reg64 IndexToReg64(std::size_t reg_index) {
    ASSERT(reg_index < 16);
-    return Xbyak::Reg64(reg_index);
+    return Xbyak::Reg64(static_cast<int>(reg_index));
 }

-inline Xbyak::Xmm IndexToXmm(int reg_index) {
+inline Xbyak::Xmm IndexToXmm(std::size_t reg_index) {
    ASSERT(reg_index >= 16 && reg_index < 32);
-    return Xbyak::Xmm(reg_index - 16);
+    return Xbyak::Xmm(static_cast<int>(reg_index - 16));
 }

-inline Xbyak::Reg IndexToReg(int reg_index) {
+inline Xbyak::Reg IndexToReg(std::size_t reg_index) {
    if (reg_index < 16) {
        return IndexToReg64(reg_index);
    } else {
@@ -151,9 +151,13 @@ constexpr size_t ABI_SHADOW_SPACE = 0;

 #endif

-inline void ABI_CalculateFrameSize(std::bitset<32> regs, size_t rsp_alignment,
-                                   size_t needed_frame_size, s32* out_subtraction,
-                                   s32* out_xmm_offset) {
+struct ABIFrameInfo {
+    s32 subtraction;
+    s32 xmm_offset;
+};
+
+inline ABIFrameInfo ABI_CalculateFrameSize(std::bitset<32> regs, size_t rsp_alignment,
+                                           size_t needed_frame_size) {
    const auto count = (regs & ABI_ALL_GPRS).count();
    rsp_alignment -= count * 8;
    size_t subtraction = 0;
@@ -170,33 +174,28 @@ inline void ABI_CalculateFrameSize(std::bitset<32> regs, size_t rsp_alignment,
    rsp_alignment -= subtraction;
    subtraction += rsp_alignment & 0xF;

-    *out_subtraction = (s32)subtraction;
-    *out_xmm_offset = (s32)(subtraction - xmm_base_subtraction);
+    return ABIFrameInfo{static_cast<s32>(subtraction),
+                        static_cast<s32>(subtraction - xmm_base_subtraction)};
 }

 inline size_t ABI_PushRegistersAndAdjustStack(Xbyak::CodeGenerator& code, std::bitset<32> regs,
                                              size_t rsp_alignment, size_t needed_frame_size = 0) {
-    s32 subtraction, xmm_offset;
-    ABI_CalculateFrameSize(regs, rsp_alignment, needed_frame_size, &subtraction, &xmm_offset);
+    auto frame_info = ABI_CalculateFrameSize(regs, rsp_alignment, needed_frame_size);
+
    for (std::size_t i = 0; i < regs.size(); ++i) {
        if (regs[i] && ABI_ALL_GPRS[i]) {
-            code.push(IndexToReg64(static_cast<int>(i)));
-        }
-    }
-    if (subtraction != 0) {
-        code.sub(code.rsp, subtraction);
-    }
-
-    for (int i = 0; i < regs.count(); i++) {
-        if (regs.test(i) & ABI_ALL_GPRS.test(i)) {
            code.push(IndexToReg64(i));
        }
    }

+    if (frame_info.subtraction != 0) {
+        code.sub(code.rsp, frame_info.subtraction);
+    }
+
    for (std::size_t i = 0; i < regs.size(); ++i) {
        if (regs[i] && ABI_ALL_XMMS[i]) {
-            code.movaps(code.xword[code.rsp + xmm_offset], IndexToXmm(static_cast<int>(i)));
-            xmm_offset += 0x10;
+            code.movaps(code.xword[code.rsp + frame_info.xmm_offset], IndexToXmm(i));
+            frame_info.xmm_offset += 0x10;
        }
    }

@@ -205,59 +204,23 @@ inline size_t ABI_PushRegistersAndAdjustStack(Xbyak::CodeGenerator& code, std::b

 inline void ABI_PopRegistersAndAdjustStack(Xbyak::CodeGenerator& code, std::bitset<32> regs,
                                           size_t rsp_alignment, size_t needed_frame_size = 0) {
-    s32 subtraction, xmm_offset;
-    ABI_CalculateFrameSize(regs, rsp_alignment, needed_frame_size, &subtraction, &xmm_offset);
+    auto frame_info = ABI_CalculateFrameSize(regs, rsp_alignment, needed_frame_size);

    for (std::size_t i = 0; i < regs.size(); ++i) {
        if (regs[i] && ABI_ALL_XMMS[i]) {
-            code.movaps(IndexToXmm(static_cast<int>(i)), code.xword[code.rsp + xmm_offset]);
-            xmm_offset += 0x10;
+            code.movaps(IndexToXmm(i), code.xword[code.rsp + frame_info.xmm_offset]);
+            frame_info.xmm_offset += 0x10;
        }
    }

-    if (subtraction != 0) {
-        code.add(code.rsp, subtraction);
+    if (frame_info.subtraction != 0) {
+        code.add(code.rsp, frame_info.subtraction);
    }

    // GPRs need to be popped in reverse order
-    for (int i = 15; i >= 0; i--) {
-        if (regs[i]) {
-            code.pop(IndexToReg64(i));
-        }
-    }
-}
-
-inline size_t ABI_PushRegistersAndAdjustStackGPS(Xbyak::CodeGenerator& code, std::bitset<32> regs,
-                                                 size_t rsp_alignment,
-                                                 size_t needed_frame_size = 0) {
-    s32 subtraction, xmm_offset;
-    ABI_CalculateFrameSize(regs, rsp_alignment, needed_frame_size, &subtraction, &xmm_offset);
-
-    for (std::size_t i = 0; i < regs.size(); ++i) {
+    for (std::size_t j = 0; j < regs.size(); ++j) {
+        const std::size_t i = regs.size() - j - 1;
        if (regs[i] && ABI_ALL_GPRS[i]) {
-            code.push(IndexToReg64(static_cast<int>(i)));
-        }
-    }
-
-    if (subtraction != 0) {
-        code.sub(code.rsp, subtraction);
-    }
-
-    return ABI_SHADOW_SPACE;
-}
-
-inline void ABI_PopRegistersAndAdjustStackGPS(Xbyak::CodeGenerator& code, std::bitset<32> regs,
-                                              size_t rsp_alignment, size_t needed_frame_size = 0) {
-    s32 subtraction, xmm_offset;
-    ABI_CalculateFrameSize(regs, rsp_alignment, needed_frame_size, &subtraction, &xmm_offset);
-
-    if (subtraction != 0) {
-        code.add(code.rsp, subtraction);
-    }
-
-    // GPRs need to be popped in reverse order
-    for (int i = 15; i >= 0; i--) {
-        if (regs[i]) {
            code.pop(IndexToReg64(i));
        }
    }
--- a/src/core/CMakeLists.txt
+++ b/src/core/CMakeLists.txt
@@ -7,6 +7,16 @@ endif()
 add_library(core STATIC
    arm/arm_interface.h
    arm/arm_interface.cpp
+    arm/cpu_interrupt_handler.cpp
+    arm/cpu_interrupt_handler.h
+    arm/dynarmic/arm_dynarmic_32.cpp
+    arm/dynarmic/arm_dynarmic_32.h
+    arm/dynarmic/arm_dynarmic_64.cpp
+    arm/dynarmic/arm_dynarmic_64.h
+    arm/dynarmic/arm_dynarmic_cp15.cpp
+    arm/dynarmic/arm_dynarmic_cp15.h
+    arm/dynarmic/arm_exclusive_monitor.cpp
+    arm/dynarmic/arm_exclusive_monitor.h
    arm/exclusive_monitor.cpp
    arm/exclusive_monitor.h
    arm/unicorn/arm_unicorn.cpp
@@ -15,8 +25,6 @@ add_library(core STATIC
    constants.h
    core.cpp
    core.h
-    core_manager.cpp
-    core_manager.h
    core_timing.cpp
    core_timing.h
    core_timing_util.cpp
@@ -606,11 +614,11 @@ endif()
 create_target_directory_groups(core)

 target_link_libraries(core PUBLIC common PRIVATE audio_core video_core)
-target_link_libraries(core PUBLIC Boost::boost PRIVATE fmt::fmt nlohmann_json::nlohmann_json mbedtls Opus::Opus unicorn)
+target_link_libraries(core PUBLIC Boost::boost PRIVATE fmt::fmt nlohmann_json::nlohmann_json mbedtls opus unicorn zip)

 if (YUZU_ENABLE_BOXCAT)
    target_compile_definitions(core PRIVATE -DYUZU_ENABLE_BOXCAT)
-    target_link_libraries(core PRIVATE httplib nlohmann_json::nlohmann_json zip)
+    target_link_libraries(core PRIVATE httplib nlohmann_json::nlohmann_json)
 endif()

 if (ENABLE_WEB_SERVICE)
--- a/src/core/arm/arm_interface.cpp
+++ b/src/core/arm/arm_interface.cpp
@@ -139,6 +139,63 @@ std::optional<std::string> GetSymbolName(const Symbols& symbols, VAddr func_addr

 constexpr u64 SEGMENT_BASE = 0x7100000000ull;

+std::vector<ARM_Interface::BacktraceEntry> ARM_Interface::GetBacktraceFromContext(
+    System& system, const ThreadContext64& ctx) {
+    std::vector<BacktraceEntry> out;
+    auto& memory = system.Memory();
+
+    auto fp = ctx.cpu_registers[29];
+    auto lr = ctx.cpu_registers[30];
+    while (true) {
+        out.push_back({"", 0, lr, 0});
+        if (!fp) {
+            break;
+        }
+        lr = memory.Read64(fp + 8) - 4;
+        fp = memory.Read64(fp);
+    }
+
+    std::map<VAddr, std::string> modules;
+    auto& loader{system.GetAppLoader()};
+    if (loader.ReadNSOModules(modules) != Loader::ResultStatus::Success) {
+        return {};
+    }
+
+    std::map<std::string, Symbols> symbols;
+    for (const auto& module : modules) {
+        symbols.insert_or_assign(module.second, GetSymbols(module.first, memory));
+    }
+
+    for (auto& entry : out) {
+        VAddr base = 0;
+        for (auto iter = modules.rbegin(); iter != modules.rend(); ++iter) {
+            const auto& module{*iter};
+            if (entry.original_address >= module.first) {
+                entry.module = module.second;
+                base = module.first;
+                break;
+            }
+        }
+
+        entry.offset = entry.original_address - base;
+        entry.address = SEGMENT_BASE + entry.offset;
+
+        if (entry.module.empty())
+            entry.module = "unknown";
+
+        const auto symbol_set = symbols.find(entry.module);
+        if (symbol_set != symbols.end()) {
+            const auto symbol = GetSymbolName(symbol_set->second, entry.offset);
+            if (symbol.has_value()) {
+                // TODO(DarkLordZach): Add demangling of symbol names.
+                entry.name = *symbol;
+            }
+        }
+    }
+
+    return out;
+}
+
 std::vector<ARM_Interface::BacktraceEntry> ARM_Interface::GetBacktrace() const {
    std::vector<BacktraceEntry> out;
    auto& memory = system.Memory();
--- a/src/core/arm/arm_interface.h
+++ b/src/core/arm/arm_interface.h
@@ -7,6 +7,7 @@
 #include <array>
 #include <vector>
 #include "common/common_types.h"
+#include "core/hardware_properties.h"

 namespace Common {
 struct PageTable;
@@ -18,25 +19,29 @@ enum class VMAPermission : u8;

 namespace Core {
 class System;
+class CPUInterruptHandler;
+
+using CPUInterrupts = std::array<CPUInterruptHandler, Core::Hardware::NUM_CPU_CORES>;

 /// Generic ARMv8 CPU interface
 class ARM_Interface : NonCopyable {
 public:
-    explicit ARM_Interface(System& system_) : system{system_} {}
+    explicit ARM_Interface(System& system_, CPUInterrupts& interrupt_handlers, bool uses_wall_clock)
+        : system{system_}, interrupt_handlers{interrupt_handlers}, uses_wall_clock{
+                                                                       uses_wall_clock} {}
    virtual ~ARM_Interface() = default;

    struct ThreadContext32 {
        std::array<u32, 16> cpu_registers{};
+        std::array<u32, 64> extension_registers{};
        u32 cpsr{};
-        std::array<u8, 4> padding{};
-        std::array<u64, 32> fprs{};
        u32 fpscr{};
        u32 fpexc{};
        u32 tpidr{};
    };
    // Internally within the kernel, it expects the AArch32 version of the
-    // thread context to be 344 bytes in size.
+    // thread context to be 336 (0x150) bytes in size. TODO: confirm; was 344 (0x158).
-    static_assert(sizeof(ThreadContext32) == 0x158);
+    static_assert(sizeof(ThreadContext32) == 0x150);

    struct ThreadContext64 {
        std::array<u64, 31> cpu_registers{};
@@ -143,6 +148,8 @@ public:
     */
    virtual void SetTPIDR_EL0(u64 value) = 0;

+    virtual void ChangeProcessorID(std::size_t new_core_id) = 0;
+
    virtual void SaveContext(ThreadContext32& ctx) = 0;
    virtual void SaveContext(ThreadContext64& ctx) = 0;
    virtual void LoadContext(const ThreadContext32& ctx) = 0;
@@ -162,6 +169,9 @@ public:
        std::string name;
    };

+    static std::vector<BacktraceEntry> GetBacktraceFromContext(System& system,
+                                                               const ThreadContext64& ctx);
+
    std::vector<BacktraceEntry> GetBacktrace() const;

    /// fp (= r29) points to the last frame record.
@@ -175,6 +185,8 @@ public:
 protected:
    /// System context that this ARM interface is running under.
    System& system;
+    CPUInterrupts& interrupt_handlers;
+    bool uses_wall_clock;
 };

 } // namespace Core
--- a/src/core/arm/cpu_interrupt_handler.cpp
+++ b/src/core/arm/cpu_interrupt_handler.cpp
@@ -0,0 +1,27 @@
+// Copyright 2020 yuzu emulator team
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include "common/thread.h"
+#include "core/arm/cpu_interrupt_handler.h"
+
+namespace Core {
+
+CPUInterruptHandler::CPUInterruptHandler() : is_interrupted{} {
+    interrupt_event = std::make_unique<Common::Event>();
+}
+
+CPUInterruptHandler::~CPUInterruptHandler() = default;
+
+void CPUInterruptHandler::SetInterrupt(bool is_interrupted_) {
+    this->is_interrupted = is_interrupted_; // store the flag before signalling any waiter
+    if (is_interrupted_) {
+        interrupt_event->Set(); // AwaitInterrupt() waits on this event
+    }
+}
+
+void CPUInterruptHandler::AwaitInterrupt() {
+    interrupt_event->Wait();
+}
+
+} // namespace Core
--- a/src/core/arm/cpu_interrupt_handler.h
+++ b/src/core/arm/cpu_interrupt_handler.h
@@ -0,0 +1,39 @@
+// Copyright 2020 yuzu emulator team
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <memory>
+
+namespace Common {
+class Event;
+}
+
+namespace Core {
+
+class CPUInterruptHandler {
+public:
+    CPUInterruptHandler();
+    ~CPUInterruptHandler();
+
+    CPUInterruptHandler(const CPUInterruptHandler&) = delete;
+    CPUInterruptHandler& operator=(const CPUInterruptHandler&) = delete;
+
+    CPUInterruptHandler(CPUInterruptHandler&&) = default;
+    CPUInterruptHandler& operator=(CPUInterruptHandler&&) = default;
+
+    bool IsInterrupted() const {
+        return is_interrupted;
+    }
+
+    void SetInterrupt(bool is_interrupted);
+
+    void AwaitInterrupt();
+
+private:
+    bool is_interrupted{};
+    std::unique_ptr<Common::Event> interrupt_event;
+};
+
+} // namespace Core
--- a/src/core/arm/dynarmic/arm_dynarmic_32.cpp
+++ b/src/core/arm/dynarmic/arm_dynarmic_32.cpp
@@ -7,15 +7,17 @@
 #include <dynarmic/A32/a32.h>
 #include <dynarmic/A32/config.h>
 #include <dynarmic/A32/context.h>
-#include "common/microprofile.h"
+#include "common/logging/log.h"
+#include "common/page_table.h"
+#include "core/arm/cpu_interrupt_handler.h"
 #include "core/arm/dynarmic/arm_dynarmic_32.h"
-#include "core/arm/dynarmic/arm_dynarmic_64.h"
 #include "core/arm/dynarmic/arm_dynarmic_cp15.h"
+#include "core/arm/dynarmic/arm_exclusive_monitor.h"
 #include "core/core.h"
-#include "core/core_manager.h"
 #include "core/core_timing.h"
 #include "core/hle/kernel/svc.h"
 #include "core/memory.h"
+#include "core/settings.h"

 namespace Core {

@@ -49,8 +51,22 @@ public:
        parent.system.Memory().Write64(vaddr, value);
    }

+    bool MemoryWriteExclusive8(u32 vaddr, u8 value, u8 expected) override {
+        return parent.system.Memory().WriteExclusive8(vaddr, value, expected);
+    }
+    bool MemoryWriteExclusive16(u32 vaddr, u16 value, u16 expected) override {
+        return parent.system.Memory().WriteExclusive16(vaddr, value, expected);
+    }
+    bool MemoryWriteExclusive32(u32 vaddr, u32 value, u32 expected) override {
+        return parent.system.Memory().WriteExclusive32(vaddr, value, expected);
+    }
+    bool MemoryWriteExclusive64(u32 vaddr, u64 value, u64 expected) override {
+        return parent.system.Memory().WriteExclusive64(vaddr, value, expected);
+    }
+
    void InterpreterFallback(u32 pc, std::size_t num_instructions) override {
-        UNIMPLEMENTED();
+        UNIMPLEMENTED_MSG("This should never happen, pc = {:08X}, code = {:08X}", pc,
+                          MemoryReadCode(pc));
    }

    void ExceptionRaised(u32 pc, Dynarmic::A32::Exception exception) override {
@@ -61,7 +77,7 @@ public:
        case Dynarmic::A32::Exception::Breakpoint:
            break;
        }
-        LOG_CRITICAL(HW_GPU, "ExceptionRaised(exception = {}, pc = {:08X}, code = {:08X})",
+        LOG_CRITICAL(Core_ARM, "ExceptionRaised(exception = {}, pc = {:08X}, code = {:08X})",
                     static_cast<std::size_t>(exception), pc, MemoryReadCode(pc));
        UNIMPLEMENTED();
    }
@@ -71,26 +87,36 @@ public:
    }

    void AddTicks(u64 ticks) override {
+        if (parent.uses_wall_clock) {
+            return;
+        }
        // Divide the number of ticks by the amount of CPU cores. TODO(Subv): This yields only a
        // rough approximation of the amount of executed ticks in the system, it may be thrown off
        // if not all cores are doing a similar amount of work. Instead of doing this, we should
        // device a way so that timing is consistent across all cores without increasing the ticks 4
        // times.
-        u64 amortized_ticks = (ticks - num_interpreted_instructions) / Core::NUM_CPU_CORES;
+        u64 amortized_ticks =
+            (ticks - num_interpreted_instructions) / Core::Hardware::NUM_CPU_CORES;
        // Always execute at least one tick.
        amortized_ticks = std::max<u64>(amortized_ticks, 1);

        parent.system.CoreTiming().AddTicks(amortized_ticks);
        num_interpreted_instructions = 0;
    }
+
    u64 GetTicksRemaining() override {
-        return std::max(parent.system.CoreTiming().GetDowncount(), {});
+        if (parent.uses_wall_clock) {
+            if (!parent.interrupt_handlers[parent.core_index].IsInterrupted()) {
+                return minimum_run_cycles;
+            }
+            return 0U;
+        }
+        return std::max<s64>(parent.system.CoreTiming().GetDowncount(), 0);
    }

    ARM_Dynarmic_32& parent;
    std::size_t num_interpreted_instructions{};
-    u64 tpidrro_el0{};
-    u64 tpidr_el0{};
+    static constexpr u64 minimum_run_cycles = 1000U;
 };

 std::shared_ptr<Dynarmic::A32::Jit> ARM_Dynarmic_32::MakeJit(Common::PageTable& page_table,
@@ -99,26 +125,68 @@ std::shared_ptr<Dynarmic::A32::Jit> ARM_Dynarmic_32::MakeJit(Common::PageTable&
    config.callbacks = cb.get();
    // TODO(bunnei): Implement page table for 32-bit
    // config.page_table = &page_table.pointers;
-    config.coprocessors[15] = std::make_shared<DynarmicCP15>((u32*)&CP15_regs[0]);
+    config.coprocessors[15] = cp15;
    config.define_unpredictable_behaviour = true;
+    static constexpr std::size_t PAGE_BITS = 12;
+    static constexpr std::size_t NUM_PAGE_TABLE_ENTRIES = 1 << (32 - PAGE_BITS);
+    config.page_table = reinterpret_cast<std::array<std::uint8_t*, NUM_PAGE_TABLE_ENTRIES>*>(
+        page_table.pointers.data());
+    config.absolute_offset_page_table = true;
+    config.detect_misaligned_access_via_page_table = 16 | 32 | 64 | 128;
+    config.only_detect_misalignment_via_page_table_on_page_boundary = true;
+
+    // Multi-process state
+    config.processor_id = core_index;
+    config.global_monitor = &exclusive_monitor.monitor;
+
+    // Timing
+    config.wall_clock_cntpct = uses_wall_clock;
+
+    // Safe optimizations
+    if (Settings::values.cpu_accuracy != Settings::CPUAccuracy::Accurate) {
+        if (!Settings::values.cpuopt_page_tables) {
+            config.page_table = nullptr;
+        }
+        if (!Settings::values.cpuopt_block_linking) {
+            config.optimizations &= ~Dynarmic::OptimizationFlag::BlockLinking;
+        }
+        if (!Settings::values.cpuopt_return_stack_buffer) {
+            config.optimizations &= ~Dynarmic::OptimizationFlag::ReturnStackBuffer;
+        }
+        if (!Settings::values.cpuopt_fast_dispatcher) {
+            config.optimizations &= ~Dynarmic::OptimizationFlag::FastDispatch;
+        }
+        if (!Settings::values.cpuopt_context_elimination) {
+            config.optimizations &= ~Dynarmic::OptimizationFlag::GetSetElimination;
+        }
+        if (!Settings::values.cpuopt_const_prop) {
+            config.optimizations &= ~Dynarmic::OptimizationFlag::ConstProp;
+        }
+        if (!Settings::values.cpuopt_misc_ir) {
+            config.optimizations &= ~Dynarmic::OptimizationFlag::MiscIROpt;
+        }
+        if (!Settings::values.cpuopt_reduce_misalign_checks) {
+            config.only_detect_misalignment_via_page_table_on_page_boundary = false;
+        }
+    }
+
    return std::make_unique<Dynarmic::A32::Jit>(config);
 }

-MICROPROFILE_DEFINE(ARM_Jit_Dynarmic_32, "ARM JIT", "Dynarmic", MP_RGB(255, 64, 64));
-
 void ARM_Dynarmic_32::Run() {
-    MICROPROFILE_SCOPE(ARM_Jit_Dynarmic_32);
    jit->Run();
 }

 void ARM_Dynarmic_32::Step() {
-    cb->InterpreterFallback(jit->Regs()[15], 1);
+    jit->Step();
 }

-ARM_Dynarmic_32::ARM_Dynarmic_32(System& system, ExclusiveMonitor& exclusive_monitor,
+ARM_Dynarmic_32::ARM_Dynarmic_32(System& system, CPUInterrupts& interrupt_handlers,
+                                 bool uses_wall_clock, ExclusiveMonitor& exclusive_monitor,
                                 std::size_t core_index)
-    : ARM_Interface{system},
-      cb(std::make_unique<DynarmicCallbacks32>(*this)), core_index{core_index},
+    : ARM_Interface{system, interrupt_handlers, uses_wall_clock},
+      cb(std::make_unique<DynarmicCallbacks32>(*this)),
+      cp15(std::make_shared<DynarmicCP15>(*this)), core_index{core_index},
      exclusive_monitor{dynamic_cast<DynarmicExclusiveMonitor&>(exclusive_monitor)} {}

 ARM_Dynarmic_32::~ARM_Dynarmic_32() = default;
@@ -154,32 +222,40 @@ void ARM_Dynarmic_32::SetPSTATE(u32 cpsr) {
 }

 u64 ARM_Dynarmic_32::GetTlsAddress() const {
-    return CP15_regs[static_cast<std::size_t>(CP15Register::CP15_THREAD_URO)];
+    return cp15->uro;
 }

 void ARM_Dynarmic_32::SetTlsAddress(VAddr address) {
-    CP15_regs[static_cast<std::size_t>(CP15Register::CP15_THREAD_URO)] = static_cast<u32>(address);
+    cp15->uro = static_cast<u32>(address);
 }

 u64 ARM_Dynarmic_32::GetTPIDR_EL0() const {
-    return cb->tpidr_el0;
+    return cp15->uprw;
 }

 void ARM_Dynarmic_32::SetTPIDR_EL0(u64 value) {
-    cb->tpidr_el0 = value;
+    cp15->uprw = static_cast<u32>(value);
+}
+
+void ARM_Dynarmic_32::ChangeProcessorID(std::size_t new_core_id) {
+    jit->ChangeProcessorID(new_core_id);
 }

 void ARM_Dynarmic_32::SaveContext(ThreadContext32& ctx) {
    Dynarmic::A32::Context context;
    jit->SaveContext(context);
    ctx.cpu_registers = context.Regs();
+    ctx.extension_registers = context.ExtRegs();
    ctx.cpsr = context.Cpsr();
+    ctx.fpscr = context.Fpscr();
 }

 void ARM_Dynarmic_32::LoadContext(const ThreadContext32& ctx) {
    Dynarmic::A32::Context context;
    context.Regs() = ctx.cpu_registers;
+    context.ExtRegs() = ctx.extension_registers;
    context.SetCpsr(ctx.cpsr);
+    context.SetFpscr(ctx.fpscr);
    jit->LoadContext(context);
 }

@@ -188,10 +264,15 @@ void ARM_Dynarmic_32::PrepareReschedule() {
 }

 void ARM_Dynarmic_32::ClearInstructionCache() {
+    if (!jit) {
+        return;
+    }
    jit->ClearCache();
 }

-void ARM_Dynarmic_32::ClearExclusiveState() {}
+void ARM_Dynarmic_32::ClearExclusiveState() {
+    jit->ClearExclusiveState();
+}

 void ARM_Dynarmic_32::PageTableChanged(Common::PageTable& page_table,
                                       std::size_t new_address_space_size_in_bits) {
--- a/src/core/arm/dynarmic/arm_dynarmic_32.h
+++ b/src/core/arm/dynarmic/arm_dynarmic_32.h
@@ -9,7 +9,7 @@

 #include <dynarmic/A32/a32.h>
 #include <dynarmic/A64/a64.h>
-#include <dynarmic/A64/exclusive_monitor.h>
+#include <dynarmic/exclusive_monitor.h>
 #include "common/common_types.h"
 #include "common/hash.h"
 #include "core/arm/arm_interface.h"
@@ -21,13 +21,16 @@ class Memory;

 namespace Core {

+class CPUInterruptHandler;
 class DynarmicCallbacks32;
+class DynarmicCP15;
 class DynarmicExclusiveMonitor;
 class System;

 class ARM_Dynarmic_32 final : public ARM_Interface {
 public:
-    ARM_Dynarmic_32(System& system, ExclusiveMonitor& exclusive_monitor, std::size_t core_index);
+    ARM_Dynarmic_32(System& system, CPUInterrupts& interrupt_handlers, bool uses_wall_clock,
+                    ExclusiveMonitor& exclusive_monitor, std::size_t core_index);
    ~ARM_Dynarmic_32() override;

    void SetPC(u64 pc) override;
@@ -44,6 +47,7 @@ public:
    void SetTlsAddress(VAddr address) override;
    void SetTPIDR_EL0(u64 value) override;
    u64 GetTPIDR_EL0() const override;
+    void ChangeProcessorID(std::size_t new_core_id) override;

    void SaveContext(ThreadContext32& ctx) override;
    void SaveContext(ThreadContext64& ctx) override {}
@@ -66,12 +70,14 @@ private:
        std::unordered_map<JitCacheKey, std::shared_ptr<Dynarmic::A32::Jit>, Common::PairHash>;

    friend class DynarmicCallbacks32;
+    friend class DynarmicCP15;
+
    std::unique_ptr<DynarmicCallbacks32> cb;
    JitCacheType jit_cache;
    std::shared_ptr<Dynarmic::A32::Jit> jit;
+    std::shared_ptr<DynarmicCP15> cp15;
    std::size_t core_index;
    DynarmicExclusiveMonitor& exclusive_monitor;
-    std::array<u32, 84> CP15_regs{};
 };

 } // namespace Core
--- a/src/core/arm/dynarmic/arm_dynarmic_64.cpp
+++ b/src/core/arm/dynarmic/arm_dynarmic_64.cpp
@@ -7,11 +7,11 @@
 #include <dynarmic/A64/a64.h>
 #include <dynarmic/A64/config.h>
 #include "common/logging/log.h"
-#include "common/microprofile.h"
 #include "common/page_table.h"
+#include "core/arm/cpu_interrupt_handler.h"
 #include "core/arm/dynarmic/arm_dynarmic_64.h"
+#include "core/arm/dynarmic/arm_exclusive_monitor.h"
 #include "core/core.h"
-#include "core/core_manager.h"
 #include "core/core_timing.h"
 #include "core/core_timing_util.h"
 #include "core/gdbstub/gdbstub.h"
@@ -65,6 +65,22 @@ public:
        memory.Write64(vaddr + 8, value[1]);
    }

+    bool MemoryWriteExclusive8(u64 vaddr, std::uint8_t value, std::uint8_t expected) override {
+        return parent.system.Memory().WriteExclusive8(vaddr, value, expected);
+    }
+    bool MemoryWriteExclusive16(u64 vaddr, std::uint16_t value, std::uint16_t expected) override {
+        return parent.system.Memory().WriteExclusive16(vaddr, value, expected);
+    }
+    bool MemoryWriteExclusive32(u64 vaddr, std::uint32_t value, std::uint32_t expected) override {
+        return parent.system.Memory().WriteExclusive32(vaddr, value, expected);
+    }
+    bool MemoryWriteExclusive64(u64 vaddr, std::uint64_t value, std::uint64_t expected) override {
+        return parent.system.Memory().WriteExclusive64(vaddr, value, expected);
+    }
+    bool MemoryWriteExclusive128(u64 vaddr, Vector value, Vector expected) override {
+        return parent.system.Memory().WriteExclusive128(vaddr, value, expected);
+    }
+
    void InterpreterFallback(u64 pc, std::size_t num_instructions) override {
        LOG_INFO(Core_ARM, "Unicorn fallback @ 0x{:X} for {} instructions (instr = {:08X})", pc,
                 num_instructions, MemoryReadCode(pc));
@@ -98,8 +114,8 @@ public:
            }
            [[fallthrough]];
        default:
-            ASSERT_MSG(false, "ExceptionRaised(exception = {}, pc = {:X})",
-                       static_cast<std::size_t>(exception), pc);
+            ASSERT_MSG(false, "ExceptionRaised(exception = {}, pc = {:08X}, code = {:08X})",
+                       static_cast<std::size_t>(exception), pc, MemoryReadCode(pc));
        }
    }

@@ -108,29 +124,42 @@ public:
    }

    void AddTicks(u64 ticks) override {
+        if (parent.uses_wall_clock) {
+            return;
+        }
        // Divide the number of ticks by the amount of CPU cores. TODO(Subv): This yields only a
        // rough approximation of the amount of executed ticks in the system, it may be thrown off
        // if not all cores are doing a similar amount of work. Instead of doing this, we should
        // device a way so that timing is consistent across all cores without increasing the ticks 4
        // times.
-        u64 amortized_ticks = (ticks - num_interpreted_instructions) / Core::NUM_CPU_CORES;
+        u64 amortized_ticks =
+            (ticks - num_interpreted_instructions) / Core::Hardware::NUM_CPU_CORES;
        // Always execute at least one tick.
        amortized_ticks = std::max<u64>(amortized_ticks, 1);

        parent.system.CoreTiming().AddTicks(amortized_ticks);
        num_interpreted_instructions = 0;
    }
+
    u64 GetTicksRemaining() override {
-        return std::max(parent.system.CoreTiming().GetDowncount(), s64{0});
+        if (parent.uses_wall_clock) {
+            if (!parent.interrupt_handlers[parent.core_index].IsInterrupted()) {
+                return minimum_run_cycles;
+            }
+            return 0U;
+        }
+        return std::max<s64>(parent.system.CoreTiming().GetDowncount(), 0);
    }
+
    u64 GetCNTPCT() override {
-        return Timing::CpuCyclesToClockCycles(parent.system.CoreTiming().GetTicks());
+        return parent.system.CoreTiming().GetClockTicks();
    }

    ARM_Dynarmic_64& parent;
    std::size_t num_interpreted_instructions = 0;
    u64 tpidrro_el0 = 0;
    u64 tpidr_el0 = 0;
+    static constexpr u64 minimum_run_cycles = 1000U;
 };

 std::shared_ptr<Dynarmic::A64::Jit> ARM_Dynarmic_64::MakeJit(Common::PageTable& page_table,
@@ -162,20 +191,41 @@ std::shared_ptr<Dynarmic::A64::Jit> ARM_Dynarmic_64::MakeJit(Common::PageTable&
    // Unpredictable instructions
    config.define_unpredictable_behaviour = true;

-    // Optimizations
-    if (Settings::values.disable_cpu_opt) {
-        config.enable_optimizations = false;
-        config.enable_fast_dispatch = false;
+    // Timing
+    config.wall_clock_cntpct = uses_wall_clock;
+
+    // Safe optimizations
+    if (Settings::values.cpu_accuracy != Settings::CPUAccuracy::Accurate) {
+        if (!Settings::values.cpuopt_page_tables) {
+            config.page_table = nullptr;
+        }
+        if (!Settings::values.cpuopt_block_linking) {
+            config.optimizations &= ~Dynarmic::OptimizationFlag::BlockLinking;
+        }
+        if (!Settings::values.cpuopt_return_stack_buffer) {
+            config.optimizations &= ~Dynarmic::OptimizationFlag::ReturnStackBuffer;
+        }
+        if (!Settings::values.cpuopt_fast_dispatcher) {
+            config.optimizations &= ~Dynarmic::OptimizationFlag::FastDispatch;
+        }
+        if (!Settings::values.cpuopt_context_elimination) {
+            config.optimizations &= ~Dynarmic::OptimizationFlag::GetSetElimination;
+        }
+        if (!Settings::values.cpuopt_const_prop) {
+            config.optimizations &= ~Dynarmic::OptimizationFlag::ConstProp;
+        }
+        if (!Settings::values.cpuopt_misc_ir) {
+            config.optimizations &= ~Dynarmic::OptimizationFlag::MiscIROpt;
+        }
+        if (!Settings::values.cpuopt_reduce_misalign_checks) {
+            config.only_detect_misalignment_via_page_table_on_page_boundary = false;
+        }
    }

    return std::make_shared<Dynarmic::A64::Jit>(config);
 }

-MICROPROFILE_DEFINE(ARM_Jit_Dynarmic_64, "ARM JIT", "Dynarmic", MP_RGB(255, 64, 64));
-
 void ARM_Dynarmic_64::Run() {
-    MICROPROFILE_SCOPE(ARM_Jit_Dynarmic_64);
-
    jit->Run();
 }

@@ -183,11 +233,16 @@ void ARM_Dynarmic_64::Step() {
    cb->InterpreterFallback(jit->GetPC(), 1);
 }

-ARM_Dynarmic_64::ARM_Dynarmic_64(System& system, ExclusiveMonitor& exclusive_monitor,
+ARM_Dynarmic_64::ARM_Dynarmic_64(System& system, CPUInterrupts& interrupt_handlers,
+                                 bool uses_wall_clock, ExclusiveMonitor& exclusive_monitor,
                                 std::size_t core_index)
-    : ARM_Interface{system}, cb(std::make_unique<DynarmicCallbacks64>(*this)),
-      inner_unicorn{system, ARM_Unicorn::Arch::AArch64}, core_index{core_index},
-      exclusive_monitor{dynamic_cast<DynarmicExclusiveMonitor&>(exclusive_monitor)} {}
+    : ARM_Interface{system, interrupt_handlers, uses_wall_clock},
+      cb(std::make_unique<DynarmicCallbacks64>(*this)), inner_unicorn{system, interrupt_handlers,
+                                                                      uses_wall_clock,
+                                                                      ARM_Unicorn::Arch::AArch64,
+                                                                      core_index},
+      core_index{core_index}, exclusive_monitor{
+                                  dynamic_cast<DynarmicExclusiveMonitor&>(exclusive_monitor)} {}

 ARM_Dynarmic_64::~ARM_Dynarmic_64() = default;

@@ -239,6 +294,10 @@ void ARM_Dynarmic_64::SetTPIDR_EL0(u64 value) {
    cb->tpidr_el0 = value;
 }

+void ARM_Dynarmic_64::ChangeProcessorID(std::size_t new_core_id) {
+    jit->ChangeProcessorID(new_core_id);
+}
+
 void ARM_Dynarmic_64::SaveContext(ThreadContext64& ctx) {
    ctx.cpu_registers = jit->GetRegisters();
    ctx.sp = jit->GetSP();
@@ -266,6 +325,9 @@ void ARM_Dynarmic_64::PrepareReschedule() {
 }

 void ARM_Dynarmic_64::ClearInstructionCache() {
+    if (!jit) {
+        return;
+    }
    jit->ClearCache();
 }

@@ -285,44 +347,4 @@ void ARM_Dynarmic_64::PageTableChanged(Common::PageTable& page_table,
    jit_cache.emplace(key, jit);
 }

-DynarmicExclusiveMonitor::DynarmicExclusiveMonitor(Memory::Memory& memory, std::size_t core_count)
-    : monitor(core_count), memory{memory} {}
-
-DynarmicExclusiveMonitor::~DynarmicExclusiveMonitor() = default;
-
-void DynarmicExclusiveMonitor::SetExclusive(std::size_t core_index, VAddr addr) {
-    // Size doesn't actually matter.
-    monitor.Mark(core_index, addr, 16);
-}
-
-void DynarmicExclusiveMonitor::ClearExclusive() {
-    monitor.Clear();
-}
-
-bool DynarmicExclusiveMonitor::ExclusiveWrite8(std::size_t core_index, VAddr vaddr, u8 value) {
-    return monitor.DoExclusiveOperation(core_index, vaddr, 1, [&] { memory.Write8(vaddr, value); });
-}
-
-bool DynarmicExclusiveMonitor::ExclusiveWrite16(std::size_t core_index, VAddr vaddr, u16 value) {
-    return monitor.DoExclusiveOperation(core_index, vaddr, 2,
-                                        [&] { memory.Write16(vaddr, value); });
-}
-
-bool DynarmicExclusiveMonitor::ExclusiveWrite32(std::size_t core_index, VAddr vaddr, u32 value) {
-    return monitor.DoExclusiveOperation(core_index, vaddr, 4,
-                                        [&] { memory.Write32(vaddr, value); });
-}
-
-bool DynarmicExclusiveMonitor::ExclusiveWrite64(std::size_t core_index, VAddr vaddr, u64 value) {
-    return monitor.DoExclusiveOperation(core_index, vaddr, 8,
-                                        [&] { memory.Write64(vaddr, value); });
-}
-
-bool DynarmicExclusiveMonitor::ExclusiveWrite128(std::size_t core_index, VAddr vaddr, u128 value) {
-    return monitor.DoExclusiveOperation(core_index, vaddr, 16, [&] {
-        memory.Write64(vaddr + 0, value[0]);
-        memory.Write64(vaddr + 8, value[1]);
-    });
-}
-
 } // namespace Core
--- a/src/core/arm/dynarmic/arm_dynarmic_64.h
+++ b/src/core/arm/dynarmic/arm_dynarmic_64.h
@@ -8,7 +8,6 @@
 #include <unordered_map>

 #include <dynarmic/A64/a64.h>
-#include <dynarmic/A64/exclusive_monitor.h>
 #include "common/common_types.h"
 #include "common/hash.h"
 #include "core/arm/arm_interface.h"
@@ -22,12 +21,14 @@ class Memory;
 namespace Core {

 class DynarmicCallbacks64;
+class CPUInterruptHandler;
 class DynarmicExclusiveMonitor;
 class System;

 class ARM_Dynarmic_64 final : public ARM_Interface {
 public:
-    ARM_Dynarmic_64(System& system, ExclusiveMonitor& exclusive_monitor, std::size_t core_index);
+    ARM_Dynarmic_64(System& system, CPUInterrupts& interrupt_handlers, bool uses_wall_clock,
+                    ExclusiveMonitor& exclusive_monitor, std::size_t core_index);
    ~ARM_Dynarmic_64() override;

    void SetPC(u64 pc) override;
@@ -44,6 +45,7 @@ public:
    void SetTlsAddress(VAddr address) override;
    void SetTPIDR_EL0(u64 value) override;
    u64 GetTPIDR_EL0() const override;
+    void ChangeProcessorID(std::size_t new_core_id) override;

    void SaveContext(ThreadContext32& ctx) override {}
    void SaveContext(ThreadContext64& ctx) override;
@@ -75,24 +77,4 @@ private:
    DynarmicExclusiveMonitor& exclusive_monitor;
 };

-class DynarmicExclusiveMonitor final : public ExclusiveMonitor {
-public:
-    explicit DynarmicExclusiveMonitor(Memory::Memory& memory, std::size_t core_count);
-    ~DynarmicExclusiveMonitor() override;
-
-    void SetExclusive(std::size_t core_index, VAddr addr) override;
-    void ClearExclusive() override;
-
-    bool ExclusiveWrite8(std::size_t core_index, VAddr vaddr, u8 value) override;
-    bool ExclusiveWrite16(std::size_t core_index, VAddr vaddr, u16 value) override;
-    bool ExclusiveWrite32(std::size_t core_index, VAddr vaddr, u32 value) override;
-    bool ExclusiveWrite64(std::size_t core_index, VAddr vaddr, u64 value) override;
-    bool ExclusiveWrite128(std::size_t core_index, VAddr vaddr, u128 value) override;
-
-private:
-    friend class ARM_Dynarmic_64;
-    Dynarmic::A64::ExclusiveMonitor monitor;
-    Core::Memory::Memory& memory;
-};
-
 } // namespace Core
--- a/src/core/arm/dynarmic/arm_dynarmic_cp15.cpp
+++ b/src/core/arm/dynarmic/arm_dynarmic_cp15.cpp
@@ -2,79 +2,132 @@
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.

+#include <fmt/format.h>
+#include "common/logging/log.h"
+#include "core/arm/dynarmic/arm_dynarmic_32.h"
 #include "core/arm/dynarmic/arm_dynarmic_cp15.h"
+#include "core/core.h"
+#include "core/core_timing.h"
+#include "core/core_timing_util.h"

 using Callback = Dynarmic::A32::Coprocessor::Callback;
 using CallbackOrAccessOneWord = Dynarmic::A32::Coprocessor::CallbackOrAccessOneWord;
 using CallbackOrAccessTwoWords = Dynarmic::A32::Coprocessor::CallbackOrAccessTwoWords;

+template <>
+struct fmt::formatter<Dynarmic::A32::CoprocReg> {
+    constexpr auto parse(format_parse_context& ctx) { // Consumes nothing from the format spec.
+        return ctx.begin();
+    }
+    template <typename FormatContext>
+    auto format(const Dynarmic::A32::CoprocReg& reg, FormatContext& ctx) const {
+        return fmt::format_to(ctx.out(), "cp{}", static_cast<size_t>(reg));
+    }
+};
+
+namespace Core {
+
+static u32 dummy_value;
+
 std::optional<Callback> DynarmicCP15::CompileInternalOperation(bool two, unsigned opc1,
                                                               CoprocReg CRd, CoprocReg CRn,
                                                               CoprocReg CRm, unsigned opc2) {
+    LOG_CRITICAL(Core_ARM, "CP15: cdp{} p15, {}, {}, {}, {}, {}", two ? "2" : "", opc1, CRd, CRn,
+                 CRm, opc2);
    return {};
 }

 CallbackOrAccessOneWord DynarmicCP15::CompileSendOneWord(bool two, unsigned opc1, CoprocReg CRn,
                                                         CoprocReg CRm, unsigned opc2) {
-    // TODO(merry): Privileged CP15 registers
-
    if (!two && CRn == CoprocReg::C7 && opc1 == 0 && CRm == CoprocReg::C5 && opc2 == 4) {
+        // CP15_FLUSH_PREFETCH_BUFFER
        // This is a dummy write, we ignore the value written here.
-        return &CP15[static_cast<std::size_t>(CP15Register::CP15_FLUSH_PREFETCH_BUFFER)];
+        return &dummy_value;
    }

    if (!two && CRn == CoprocReg::C7 && opc1 == 0 && CRm == CoprocReg::C10) {
        switch (opc2) {
        case 4:
+            // CP15_DATA_SYNC_BARRIER
            // This is a dummy write, we ignore the value written here.
-            return &CP15[static_cast<std::size_t>(CP15Register::CP15_DATA_SYNC_BARRIER)];
+            return &dummy_value;
        case 5:
+            // CP15_DATA_MEMORY_BARRIER
            // This is a dummy write, we ignore the value written here.
-            return &CP15[static_cast<std::size_t>(CP15Register::CP15_DATA_MEMORY_BARRIER)];
-        default:
-            return {};
+            return &dummy_value;
        }
    }

    if (!two && CRn == CoprocReg::C13 && opc1 == 0 && CRm == CoprocReg::C0 && opc2 == 2) {
-        return &CP15[static_cast<std::size_t>(CP15Register::CP15_THREAD_UPRW)];
+        // CP15_THREAD_UPRW
+        return &uprw;
    }

+    LOG_CRITICAL(Core_ARM, "CP15: mcr{} p15, {}, <Rt>, {}, {}, {}", two ? "2" : "", opc1, CRn, CRm,
+                 opc2);
    return {};
 }

 CallbackOrAccessTwoWords DynarmicCP15::CompileSendTwoWords(bool two, unsigned opc, CoprocReg CRm) {
+    LOG_CRITICAL(Core_ARM, "CP15: mcrr{} p15, {}, <Rt>, <Rt2>, {}", two ? "2" : "", opc, CRm);
    return {};
 }

 CallbackOrAccessOneWord DynarmicCP15::CompileGetOneWord(bool two, unsigned opc1, CoprocReg CRn,
                                                        CoprocReg CRm, unsigned opc2) {
-    // TODO(merry): Privileged CP15 registers
-
    if (!two && CRn == CoprocReg::C13 && opc1 == 0 && CRm == CoprocReg::C0) {
        switch (opc2) {
        case 2:
-            return &CP15[static_cast<std::size_t>(CP15Register::CP15_THREAD_UPRW)];
+            // CP15_THREAD_UPRW
+            return &uprw;
        case 3:
-            return &CP15[static_cast<std::size_t>(CP15Register::CP15_THREAD_URO)];
-        default:
-            return {};
+            // CP15_THREAD_URO
+            return &uro;
        }
    }

+    LOG_CRITICAL(Core_ARM, "CP15: mrc{} p15, {}, <Rt>, {}, {}, {}", two ? "2" : "", opc1, CRn, CRm,
+                 opc2);
    return {};
 }

 CallbackOrAccessTwoWords DynarmicCP15::CompileGetTwoWords(bool two, unsigned opc, CoprocReg CRm) {
+    if (!two && opc == 0 && CRm == CoprocReg::C14) {
+        // CNTPCT: served by a callback that reads the core timing clock ticks.
+        const auto callback = static_cast<u64 (*)(Dynarmic::A32::Jit*, void*, u32, u32)>(
+            [](Dynarmic::A32::Jit*, void* arg, u32, u32) -> u64 {
+                auto& cpu = *static_cast<ARM_Dynarmic_32*>(arg); // arg is &parent, passed below
+                return cpu.system.CoreTiming().GetClockTicks();
+            });
+        return Dynarmic::A32::Coprocessor::Callback{callback, static_cast<void*>(&parent)};
+    }
+
+    LOG_CRITICAL(Core_ARM, "CP15: mrrc{} p15, {}, <Rt>, <Rt2>, {}", two ? "2" : "", opc, CRm);
    return {};
 }

 std::optional<Callback> DynarmicCP15::CompileLoadWords(bool two, bool long_transfer, CoprocReg CRd,
                                                       std::optional<u8> option) {
+    if (option) { // Unhandled coprocessor load (LDC/LDC2): log it with the matching mnemonic.
+        LOG_CRITICAL(Core_ARM, "CP15: ldc{}{} p15, {}, [...], {}", two ? "2" : "",
+                     long_transfer ? "l" : "", CRd, *option);
+    } else {
+        LOG_CRITICAL(Core_ARM, "CP15: ldc{}{} p15, {}, [...]", two ? "2" : "",
+                     long_transfer ? "l" : "", CRd);
+    }
    return {};
 }

 std::optional<Callback> DynarmicCP15::CompileStoreWords(bool two, bool long_transfer, CoprocReg CRd,
                                                        std::optional<u8> option) {
+    if (option) { // Unhandled coprocessor store (STC/STC2): log it with the matching mnemonic.
+        LOG_CRITICAL(Core_ARM, "CP15: stc{}{} p15, {}, [...], {}", two ? "2" : "",
+                     long_transfer ? "l" : "", CRd, *option);
+    } else {
+        LOG_CRITICAL(Core_ARM, "CP15: stc{}{} p15, {}, [...]", two ? "2" : "",
+                     long_transfer ? "l" : "", CRd);
+    }
    return {};
 }
+
+} // namespace Core
--- a/src/core/arm/dynarmic/arm_dynarmic_cp15.h
+++ b/src/core/arm/dynarmic/arm_dynarmic_cp15.h
@@ -10,128 +10,15 @@
 #include <dynarmic/A32/coprocessor.h>
 #include "common/common_types.h"

-enum class CP15Register {
-    // c0 - Information registers
-    CP15_MAIN_ID,
-    CP15_CACHE_TYPE,
-    CP15_TCM_STATUS,
-    CP15_TLB_TYPE,
-    CP15_CPU_ID,
-    CP15_PROCESSOR_FEATURE_0,
-    CP15_PROCESSOR_FEATURE_1,
-    CP15_DEBUG_FEATURE_0,
-    CP15_AUXILIARY_FEATURE_0,
-    CP15_MEMORY_MODEL_FEATURE_0,
-    CP15_MEMORY_MODEL_FEATURE_1,
-    CP15_MEMORY_MODEL_FEATURE_2,
-    CP15_MEMORY_MODEL_FEATURE_3,
-    CP15_ISA_FEATURE_0,
-    CP15_ISA_FEATURE_1,
-    CP15_ISA_FEATURE_2,
-    CP15_ISA_FEATURE_3,
-    CP15_ISA_FEATURE_4,
+namespace Core {

-    // c1 - Control registers
-    CP15_CONTROL,
-    CP15_AUXILIARY_CONTROL,
-    CP15_COPROCESSOR_ACCESS_CONTROL,
-
-    // c2 - Translation table registers
-    CP15_TRANSLATION_BASE_TABLE_0,
-    CP15_TRANSLATION_BASE_TABLE_1,
-    CP15_TRANSLATION_BASE_CONTROL,
-    CP15_DOMAIN_ACCESS_CONTROL,
-    CP15_RESERVED,
-
-    // c5 - Fault status registers
-    CP15_FAULT_STATUS,
-    CP15_INSTR_FAULT_STATUS,
-    CP15_COMBINED_DATA_FSR = CP15_FAULT_STATUS,
-    CP15_INST_FSR,
-
-    // c6 - Fault Address registers
-    CP15_FAULT_ADDRESS,
-    CP15_COMBINED_DATA_FAR = CP15_FAULT_ADDRESS,
-    CP15_WFAR,
-    CP15_IFAR,
-
-    // c7 - Cache operation registers
-    CP15_WAIT_FOR_INTERRUPT,
-    CP15_PHYS_ADDRESS,
-    CP15_INVALIDATE_INSTR_CACHE,
-    CP15_INVALIDATE_INSTR_CACHE_USING_MVA,
-    CP15_INVALIDATE_INSTR_CACHE_USING_INDEX,
-    CP15_FLUSH_PREFETCH_BUFFER,
-    CP15_FLUSH_BRANCH_TARGET_CACHE,
-    CP15_FLUSH_BRANCH_TARGET_CACHE_ENTRY,
-    CP15_INVALIDATE_DATA_CACHE,
-    CP15_INVALIDATE_DATA_CACHE_LINE_USING_MVA,
-    CP15_INVALIDATE_DATA_CACHE_LINE_USING_INDEX,
-    CP15_INVALIDATE_DATA_AND_INSTR_CACHE,
-    CP15_CLEAN_DATA_CACHE,
-    CP15_CLEAN_DATA_CACHE_LINE_USING_MVA,
-    CP15_CLEAN_DATA_CACHE_LINE_USING_INDEX,
-    CP15_DATA_SYNC_BARRIER,
-    CP15_DATA_MEMORY_BARRIER,
-    CP15_CLEAN_AND_INVALIDATE_DATA_CACHE,
-    CP15_CLEAN_AND_INVALIDATE_DATA_CACHE_LINE_USING_MVA,
-    CP15_CLEAN_AND_INVALIDATE_DATA_CACHE_LINE_USING_INDEX,
-
-    // c8 - TLB operations
-    CP15_INVALIDATE_ITLB,
-    CP15_INVALIDATE_ITLB_SINGLE_ENTRY,
-    CP15_INVALIDATE_ITLB_ENTRY_ON_ASID_MATCH,
-    CP15_INVALIDATE_ITLB_ENTRY_ON_MVA,
-    CP15_INVALIDATE_DTLB,
-    CP15_INVALIDATE_DTLB_SINGLE_ENTRY,
-    CP15_INVALIDATE_DTLB_ENTRY_ON_ASID_MATCH,
-    CP15_INVALIDATE_DTLB_ENTRY_ON_MVA,
-    CP15_INVALIDATE_UTLB,
-    CP15_INVALIDATE_UTLB_SINGLE_ENTRY,
-    CP15_INVALIDATE_UTLB_ENTRY_ON_ASID_MATCH,
-    CP15_INVALIDATE_UTLB_ENTRY_ON_MVA,
-
-    // c9 - Data cache lockdown register
-    CP15_DATA_CACHE_LOCKDOWN,
-
-    // c10 - TLB/Memory map registers
-    CP15_TLB_LOCKDOWN,
-    CP15_PRIMARY_REGION_REMAP,
-    CP15_NORMAL_REGION_REMAP,
-
-    // c13 - Thread related registers
-    CP15_PID,
-    CP15_CONTEXT_ID,
-    CP15_THREAD_UPRW, // Thread ID register - User/Privileged Read/Write
-    CP15_THREAD_URO,  // Thread ID register - User Read Only (Privileged R/W)
-    CP15_THREAD_PRW,  // Thread ID register - Privileged R/W only.
-
-    // c15 - Performance and TLB lockdown registers
-    CP15_PERFORMANCE_MONITOR_CONTROL,
-    CP15_CYCLE_COUNTER,
-    CP15_COUNT_0,
-    CP15_COUNT_1,
-    CP15_READ_MAIN_TLB_LOCKDOWN_ENTRY,
-    CP15_WRITE_MAIN_TLB_LOCKDOWN_ENTRY,
-    CP15_MAIN_TLB_LOCKDOWN_VIRT_ADDRESS,
-    CP15_MAIN_TLB_LOCKDOWN_PHYS_ADDRESS,
-    CP15_MAIN_TLB_LOCKDOWN_ATTRIBUTE,
-    CP15_TLB_DEBUG_CONTROL,
-
-    // Skyeye defined
-    CP15_TLB_FAULT_ADDR,
-    CP15_TLB_FAULT_STATUS,
-
-    // Not an actual register.
-    // All registers should be defined above this.
-    CP15_REGISTER_COUNT,
-};
+class ARM_Dynarmic_32;

 class DynarmicCP15 final : public Dynarmic::A32::Coprocessor {
 public:
    using CoprocReg = Dynarmic::A32::CoprocReg;

-    explicit DynarmicCP15(u32* cp15) : CP15(cp15){};
+    explicit DynarmicCP15(ARM_Dynarmic_32& parent) : parent(parent) {}

    std::optional<Callback> CompileInternalOperation(bool two, unsigned opc1, CoprocReg CRd,
                                                     CoprocReg CRn, CoprocReg CRm,
@@ -147,6 +34,9 @@ public:
    std::optional<Callback> CompileStoreWords(bool two, bool long_transfer, CoprocReg CRd,
                                              std::optional<u8> option) override;

-private:
-    u32* CP15{};
+    ARM_Dynarmic_32& parent;
+    u32 uprw{}; // Storage whose address is returned for CP15_THREAD_UPRW accesses.
+    u32 uro{};  // Storage whose address is returned for CP15_THREAD_URO reads.
 };
+
+} // namespace Core
--- a/src/core/arm/dynarmic/arm_exclusive_monitor.cpp
+++ b/src/core/arm/dynarmic/arm_exclusive_monitor.cpp
@@ -0,0 +1,76 @@
+// Copyright 2018 yuzu emulator team
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include <cinttypes>
+#include <memory>
+#include "core/arm/dynarmic/arm_exclusive_monitor.h"
+#include "core/memory.h"
+
+namespace Core {
+
+DynarmicExclusiveMonitor::DynarmicExclusiveMonitor(Memory::Memory& memory, std::size_t core_count)
+    : monitor(core_count), memory{memory} {}
+
+DynarmicExclusiveMonitor::~DynarmicExclusiveMonitor() = default;
+
+u8 DynarmicExclusiveMonitor::ExclusiveRead8(std::size_t core_index, VAddr addr) {
+    return monitor.ReadAndMark<u8>(core_index, addr, [&]() -> u8 { return memory.Read8(addr); });
+}
+
+u16 DynarmicExclusiveMonitor::ExclusiveRead16(std::size_t core_index, VAddr addr) {
+    return monitor.ReadAndMark<u16>(core_index, addr, [&]() -> u16 { return memory.Read16(addr); });
+}
+
+u32 DynarmicExclusiveMonitor::ExclusiveRead32(std::size_t core_index, VAddr addr) {
+    return monitor.ReadAndMark<u32>(core_index, addr, [&]() -> u32 { return memory.Read32(addr); });
+}
+
+u64 DynarmicExclusiveMonitor::ExclusiveRead64(std::size_t core_index, VAddr addr) {
+    return monitor.ReadAndMark<u64>(core_index, addr, [&]() -> u64 { return memory.Read64(addr); });
+}
+
+u128 DynarmicExclusiveMonitor::ExclusiveRead128(std::size_t core_index, VAddr addr) {
+    return monitor.ReadAndMark<u128>(core_index, addr, [&]() -> u128 {
+        u128 result;
+        result[0] = memory.Read64(addr);
+        result[1] = memory.Read64(addr + 8);
+        return result;
+    });
+}
+
+void DynarmicExclusiveMonitor::ClearExclusive() {
+    monitor.Clear();
+}
+
+bool DynarmicExclusiveMonitor::ExclusiveWrite8(std::size_t core_index, VAddr vaddr, u8 value) {
+    return monitor.DoExclusiveOperation<u8>(core_index, vaddr, [&](u8 expected) -> bool {
+        return memory.WriteExclusive8(vaddr, value, expected);
+    });
+}
+
+bool DynarmicExclusiveMonitor::ExclusiveWrite16(std::size_t core_index, VAddr vaddr, u16 value) {
+    return monitor.DoExclusiveOperation<u16>(core_index, vaddr, [&](u16 expected) -> bool {
+        return memory.WriteExclusive16(vaddr, value, expected);
+    });
+}
+
+bool DynarmicExclusiveMonitor::ExclusiveWrite32(std::size_t core_index, VAddr vaddr, u32 value) {
+    return monitor.DoExclusiveOperation<u32>(core_index, vaddr, [&](u32 expected) -> bool {
+        return memory.WriteExclusive32(vaddr, value, expected);
+    });
+}
+
+bool DynarmicExclusiveMonitor::ExclusiveWrite64(std::size_t core_index, VAddr vaddr, u64 value) {
+    return monitor.DoExclusiveOperation<u64>(core_index, vaddr, [&](u64 expected) -> bool {
+        return memory.WriteExclusive64(vaddr, value, expected);
+    });
+}
+
+bool DynarmicExclusiveMonitor::ExclusiveWrite128(std::size_t core_index, VAddr vaddr, u128 value) {
+    return monitor.DoExclusiveOperation<u128>(core_index, vaddr, [&](u128 expected) -> bool {
+        return memory.WriteExclusive128(vaddr, value, expected);
+    });
+}
+
+} // namespace Core
--- a/src/core/arm/dynarmic/arm_exclusive_monitor.h
+++ b/src/core/arm/dynarmic/arm_exclusive_monitor.h
@@ -0,0 +1,48 @@
+// Copyright 2020 yuzu emulator team
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <memory>
+#include <unordered_map>
+
+#include <dynarmic/exclusive_monitor.h>
+
+#include "common/common_types.h"
+#include "core/arm/dynarmic/arm_dynarmic_32.h"
+#include "core/arm/dynarmic/arm_dynarmic_64.h"
+#include "core/arm/exclusive_monitor.h"
+
+namespace Core::Memory {
+class Memory;
+}
+
+namespace Core {
+
+class DynarmicExclusiveMonitor final : public ExclusiveMonitor {
+public:
+    explicit DynarmicExclusiveMonitor(Memory::Memory& memory, std::size_t core_count);
+    ~DynarmicExclusiveMonitor() override;
+
+    u8 ExclusiveRead8(std::size_t core_index, VAddr addr) override;
+    u16 ExclusiveRead16(std::size_t core_index, VAddr addr) override;
+    u32 ExclusiveRead32(std::size_t core_index, VAddr addr) override;
+    u64 ExclusiveRead64(std::size_t core_index, VAddr addr) override;
+    u128 ExclusiveRead128(std::size_t core_index, VAddr addr) override;
+    void ClearExclusive() override;
+
+    bool ExclusiveWrite8(std::size_t core_index, VAddr vaddr, u8 value) override;
+    bool ExclusiveWrite16(std::size_t core_index, VAddr vaddr, u16 value) override;
+    bool ExclusiveWrite32(std::size_t core_index, VAddr vaddr, u32 value) override;
+    bool ExclusiveWrite64(std::size_t core_index, VAddr vaddr, u64 value) override;
+    bool ExclusiveWrite128(std::size_t core_index, VAddr vaddr, u128 value) override;
+
+private:
+    friend class ARM_Dynarmic_32;
+    friend class ARM_Dynarmic_64;
+    Dynarmic::ExclusiveMonitor monitor;
+    Core::Memory::Memory& memory;
+};
+
+} // namespace Core
--- a/src/core/arm/exclusive_monitor.cpp
+++ b/src/core/arm/exclusive_monitor.cpp
@@ -3,7 +3,7 @@
 // Refer to the license.txt file included.

 #ifdef ARCHITECTURE_x86_64
-#include "core/arm/dynarmic/arm_dynarmic_64.h"
+#include "core/arm/dynarmic/arm_exclusive_monitor.h"
 #endif
 #include "core/arm/exclusive_monitor.h"
 #include "core/memory.h"
--- a/src/core/arm/exclusive_monitor.h
+++ b/src/core/arm/exclusive_monitor.h
@@ -18,7 +18,11 @@ class ExclusiveMonitor {
 public:
    virtual ~ExclusiveMonitor();

-    virtual void SetExclusive(std::size_t core_index, VAddr addr) = 0;
+    virtual u8 ExclusiveRead8(std::size_t core_index, VAddr addr) = 0;
+    virtual u16 ExclusiveRead16(std::size_t core_index, VAddr addr) = 0;
+    virtual u32 ExclusiveRead32(std::size_t core_index, VAddr addr) = 0;
+    virtual u64 ExclusiveRead64(std::size_t core_index, VAddr addr) = 0;
+    virtual u128 ExclusiveRead128(std::size_t core_index, VAddr addr) = 0;
    virtual void ClearExclusive() = 0;

    virtual bool ExclusiveWrite8(std::size_t core_index, VAddr vaddr, u8 value) = 0;
--- a/src/core/arm/unicorn/arm_unicorn.cpp
+++ b/src/core/arm/unicorn/arm_unicorn.cpp
@@ -6,6 +6,7 @@
 #include <unicorn/arm64.h>
 #include "common/assert.h"
 #include "common/microprofile.h"
+#include "core/arm/cpu_interrupt_handler.h"
 #include "core/arm/unicorn/arm_unicorn.h"
 #include "core/core.h"
 #include "core/core_timing.h"
@@ -62,7 +63,9 @@ static bool UnmappedMemoryHook(uc_engine* uc, uc_mem_type type, u64 addr, int si
    return false;
 }

-ARM_Unicorn::ARM_Unicorn(System& system, Arch architecture) : ARM_Interface{system} {
+ARM_Unicorn::ARM_Unicorn(System& system, CPUInterrupts& interrupt_handlers, bool uses_wall_clock,
+                         Arch architecture, std::size_t core_index)
+    : ARM_Interface{system, interrupt_handlers, uses_wall_clock}, core_index{core_index} {
    const auto arch = architecture == Arch::AArch32 ? UC_ARCH_ARM : UC_ARCH_ARM64;
    CHECKED(uc_open(arch, UC_MODE_ARM, &uc));

@@ -156,12 +159,20 @@ void ARM_Unicorn::SetTPIDR_EL0(u64 value) {
    CHECKED(uc_reg_write(uc, UC_ARM64_REG_TPIDR_EL0, &value));
 }

+void ARM_Unicorn::ChangeProcessorID(std::size_t new_core_id) {
+    core_index = new_core_id;
+}
+
 void ARM_Unicorn::Run() {
    if (GDBStub::IsServerEnabled()) {
        ExecuteInstructions(std::max(4000000U, 0U));
    } else {
-        ExecuteInstructions(
-            std::max(std::size_t(system.CoreTiming().GetDowncount()), std::size_t{0}));
+        while (true) {
+            if (interrupt_handlers[core_index].IsInterrupted()) {
+                return;
+            }
+            ExecuteInstructions(10);
+        }
    }
 }

@@ -183,8 +194,6 @@ void ARM_Unicorn::ExecuteInstructions(std::size_t num_instructions) {
                           UC_PROT_READ | UC_PROT_WRITE | UC_PROT_EXEC, page_buffer.data()));
    CHECKED(uc_emu_start(uc, GetPC(), 1ULL << 63, 0, num_instructions));
    CHECKED(uc_mem_unmap(uc, map_addr, page_buffer.size()));
-
-    system.CoreTiming().AddTicks(num_instructions);
    if (GDBStub::IsServerEnabled()) {
        if (last_bkpt_hit && last_bkpt.type == GDBStub::BreakpointType::Execute) {
            uc_reg_write(uc, UC_ARM64_REG_PC, &last_bkpt.address);
--- a/src/core/arm/unicorn/arm_unicorn.h
+++ b/src/core/arm/unicorn/arm_unicorn.h
@@ -20,7 +20,8 @@ public:
        AArch64, // 64-bit ARM
    };

-    explicit ARM_Unicorn(System& system, Arch architecture);
+    explicit ARM_Unicorn(System& system, CPUInterrupts& interrupt_handlers, bool uses_wall_clock,
+                         Arch architecture, std::size_t core_index);
    ~ARM_Unicorn() override;

    void SetPC(u64 pc) override;
@@ -35,6 +36,7 @@ public:
    void SetTlsAddress(VAddr address) override;
    void SetTPIDR_EL0(u64 value) override;
    u64 GetTPIDR_EL0() const override;
+    void ChangeProcessorID(std::size_t new_core_id) override;
    void PrepareReschedule() override;
    void ClearExclusiveState() override;
    void ExecuteInstructions(std::size_t num_instructions);
@@ -55,6 +57,7 @@ private:
    uc_engine* uc{};
    GDBStub::BreakpointAddress last_bkpt{};
    bool last_bkpt_hit = false;
+    std::size_t core_index;
 };

 } // namespace Core
--- a/src/core/core.cpp
+++ b/src/core/core.cpp
@@ -8,10 +8,10 @@

 #include "common/file_util.h"
 #include "common/logging/log.h"
+#include "common/microprofile.h"
 #include "common/string_util.h"
 #include "core/arm/exclusive_monitor.h"
 #include "core/core.h"
-#include "core/core_manager.h"
 #include "core/core_timing.h"
 #include "core/cpu_manager.h"
 #include "core/device_memory.h"
@@ -51,6 +51,11 @@
 #include "video_core/renderer_base.h"
 #include "video_core/video_core.h"

+MICROPROFILE_DEFINE(ARM_Jit_Dynarmic_CPU0, "ARM JIT", "Dynarmic CPU 0", MP_RGB(255, 64, 64));
+MICROPROFILE_DEFINE(ARM_Jit_Dynarmic_CPU1, "ARM JIT", "Dynarmic CPU 1", MP_RGB(255, 64, 64));
+MICROPROFILE_DEFINE(ARM_Jit_Dynarmic_CPU2, "ARM JIT", "Dynarmic CPU 2", MP_RGB(255, 64, 64));
+MICROPROFILE_DEFINE(ARM_Jit_Dynarmic_CPU3, "ARM JIT", "Dynarmic CPU 3", MP_RGB(255, 64, 64));
+
 namespace Core {

 namespace {
@@ -117,23 +122,22 @@ struct System::Impl {
        : kernel{system}, fs_controller{system}, memory{system},
          cpu_manager{system}, reporter{system}, applet_manager{system} {}

-    CoreManager& CurrentCoreManager() {
-        return cpu_manager.GetCurrentCoreManager();
-    }
-
-    Kernel::PhysicalCore& CurrentPhysicalCore() {
-        const auto index = cpu_manager.GetActiveCoreIndex();
-        return kernel.PhysicalCore(index);
-    }
-
-    Kernel::PhysicalCore& GetPhysicalCore(std::size_t index) {
-        return kernel.PhysicalCore(index);
-    }
-
-    ResultStatus RunLoop(bool tight_loop) {
+    ResultStatus Run() {
        status = ResultStatus::Success;

-        cpu_manager.RunLoop(tight_loop);
+        kernel.Suspend(false);
+        core_timing.SyncPause(false);
+        cpu_manager.Pause(false);
+
+        return status;
+    }
+
+    ResultStatus Pause() {
+        status = ResultStatus::Success;
+
+        core_timing.SyncPause(true);
+        kernel.Suspend(true);
+        cpu_manager.Pause(true);

        return status;
    }
@@ -143,14 +147,22 @@ struct System::Impl {

        device_memory = std::make_unique<Core::DeviceMemory>(system);

-        core_timing.Initialize();
+        is_multicore = Settings::values.use_multi_core.GetValue();
+        is_async_gpu = is_multicore || Settings::values.use_asynchronous_gpu_emulation.GetValue();
+
+        kernel.SetMulticore(is_multicore);
+        cpu_manager.SetMulticore(is_multicore);
+        cpu_manager.SetAsyncGpu(is_async_gpu);
+        core_timing.SetMulticore(is_multicore);
+
+        core_timing.Initialize([&system]() { system.RegisterHostThread(); });
        kernel.Initialize();
        cpu_manager.Initialize();

        const auto current_time = std::chrono::duration_cast<std::chrono::seconds>(
            std::chrono::system_clock::now().time_since_epoch());
        Settings::values.custom_rtc_differential =
-            Settings::values.custom_rtc.value_or(current_time) - current_time;
+            Settings::values.custom_rtc.GetValue().value_or(current_time) - current_time;

        // Create a default fs if one doesn't already exist.
        if (virtual_filesystem == nullptr)
@@ -180,6 +192,11 @@ struct System::Impl {
        is_powered_on = true;
        exit_lock = false;

+        microprofile_dynarmic[0] = MICROPROFILE_TOKEN(ARM_Jit_Dynarmic_CPU0);
+        microprofile_dynarmic[1] = MICROPROFILE_TOKEN(ARM_Jit_Dynarmic_CPU1);
+        microprofile_dynarmic[2] = MICROPROFILE_TOKEN(ARM_Jit_Dynarmic_CPU2);
+        microprofile_dynarmic[3] = MICROPROFILE_TOKEN(ARM_Jit_Dynarmic_CPU3);
+
        LOG_DEBUG(Core, "Initialized OK");

        return ResultStatus::Success;
@@ -277,8 +294,6 @@ struct System::Impl {
        service_manager.reset();
        cheat_engine.reset();
        telemetry_session.reset();
-        perf_stats.reset();
-        gpu_core.reset();
        device_memory.reset();

        // Close all CPU/threading state
@@ -290,6 +305,8 @@ struct System::Impl {

        // Close app loader
        app_loader.reset();
+        gpu_core.reset();
+        perf_stats.reset();

        // Clear all applets
        applet_manager.ClearAll();
@@ -382,25 +399,35 @@ struct System::Impl {

    std::unique_ptr<Core::PerfStats> perf_stats;
    Core::FrameLimiter frame_limiter;
+
+    bool is_multicore{};
+    bool is_async_gpu{};
+
+    std::array<u64, Core::Hardware::NUM_CPU_CORES> dynarmic_ticks{};
+    std::array<MicroProfileToken, Core::Hardware::NUM_CPU_CORES> microprofile_dynarmic{};
 };

 System::System() : impl{std::make_unique<Impl>(*this)} {}
 System::~System() = default;

-CoreManager& System::CurrentCoreManager() {
-    return impl->CurrentCoreManager();
+CpuManager& System::GetCpuManager() {
+    return impl->cpu_manager;
 }

-const CoreManager& System::CurrentCoreManager() const {
-    return impl->CurrentCoreManager();
+const CpuManager& System::GetCpuManager() const {
+    return impl->cpu_manager;
 }

-System::ResultStatus System::RunLoop(bool tight_loop) {
-    return impl->RunLoop(tight_loop);
+System::ResultStatus System::Run() {
+    return impl->Run();
+}
+
+System::ResultStatus System::Pause() {
+    return impl->Pause();
 }

 System::ResultStatus System::SingleStep() {
-    return RunLoop(false);
+    return ResultStatus::Success;
 }

 void System::InvalidateCpuInstructionCaches() {
@@ -416,7 +443,7 @@ bool System::IsPoweredOn() const {
 }

 void System::PrepareReschedule() {
-    impl->CurrentPhysicalCore().Stop();
+    // Deprecated, does nothing, kept for backward compatibility.
 }

 void System::PrepareReschedule(const u32 core_index) {
@@ -436,31 +463,41 @@ const TelemetrySession& System::TelemetrySession() const {
 }

 ARM_Interface& System::CurrentArmInterface() {
-    return impl->CurrentPhysicalCore().ArmInterface();
+    return impl->kernel.CurrentScheduler().GetCurrentThread()->ArmInterface();
 }

 const ARM_Interface& System::CurrentArmInterface() const {
-    return impl->CurrentPhysicalCore().ArmInterface();
+    return impl->kernel.CurrentScheduler().GetCurrentThread()->ArmInterface();
 }

 std::size_t System::CurrentCoreIndex() const {
-    return impl->cpu_manager.GetActiveCoreIndex();
+    std::size_t core = impl->kernel.GetCurrentHostThreadID();
+    ASSERT(core < Core::Hardware::NUM_CPU_CORES);
+    return core;
 }

 Kernel::Scheduler& System::CurrentScheduler() {
-    return impl->CurrentPhysicalCore().Scheduler();
+    return impl->kernel.CurrentScheduler();
 }

 const Kernel::Scheduler& System::CurrentScheduler() const {
-    return impl->CurrentPhysicalCore().Scheduler();
+    return impl->kernel.CurrentScheduler();
+}
+
+Kernel::PhysicalCore& System::CurrentPhysicalCore() {
+    return impl->kernel.CurrentPhysicalCore();
+}
+
+const Kernel::PhysicalCore& System::CurrentPhysicalCore() const {
+    return impl->kernel.CurrentPhysicalCore();
 }

 Kernel::Scheduler& System::Scheduler(std::size_t core_index) {
-    return impl->GetPhysicalCore(core_index).Scheduler();
+    return impl->kernel.Scheduler(core_index);
 }

 const Kernel::Scheduler& System::Scheduler(std::size_t core_index) const {
-    return impl->GetPhysicalCore(core_index).Scheduler();
+    return impl->kernel.Scheduler(core_index);
 }

 /// Gets the global scheduler
@@ -490,20 +527,15 @@ const Kernel::Process* System::CurrentProcess() const {
 }

 ARM_Interface& System::ArmInterface(std::size_t core_index) {
-    return impl->GetPhysicalCore(core_index).ArmInterface();
+    auto* thread = impl->kernel.Scheduler(core_index).GetCurrentThread();
+    ASSERT(thread && !thread->IsHLEThread());
+    return thread->ArmInterface();
 }

 const ARM_Interface& System::ArmInterface(std::size_t core_index) const {
-    return impl->GetPhysicalCore(core_index).ArmInterface();
-}
-
-CoreManager& System::GetCoreManager(std::size_t core_index) {
-    return impl->cpu_manager.GetCoreManager(core_index);
-}
-
-const CoreManager& System::GetCoreManager(std::size_t core_index) const {
-    ASSERT(core_index < NUM_CPU_CORES);
-    return impl->cpu_manager.GetCoreManager(core_index);
+    auto* thread = impl->kernel.Scheduler(core_index).GetCurrentThread();
+    ASSERT(thread && !thread->IsHLEThread());
+    return thread->ArmInterface();
 }

 ExclusiveMonitor& System::Monitor() {
@@ -722,4 +754,24 @@ void System::RegisterHostThread() {
    impl->kernel.RegisterHostThread();
 }

+/// Opens the Dynarmic MicroProfile scope for the calling host thread's slot.
+void System::EnterDynarmicProfile() {
+    const std::size_t core = impl->kernel.GetCurrentHostThreadID();
+    // The ID is used as an array index below; apply the same bound CurrentCoreIndex() checks.
+    ASSERT(core < Core::Hardware::NUM_CPU_CORES);
+    impl->dynarmic_ticks[core] = MicroProfileEnter(impl->microprofile_dynarmic[core]);
+}
+
+/// Closes the scope opened by EnterDynarmicProfile() using the tick value stored for this slot.
+void System::ExitDynarmicProfile() {
+    const std::size_t core = impl->kernel.GetCurrentHostThreadID();
+    ASSERT(core < Core::Hardware::NUM_CPU_CORES);
+    MicroProfileLeave(impl->microprofile_dynarmic[core], impl->dynarmic_ticks[core]);
+}
+
+/// Returns the multicore flag held by the implementation object.
+bool System::IsMulticore() const {
+    return impl->is_multicore;
+}
+
 } // namespace Core
--- a/src/core/core.h
+++ b/src/core/core.h
@@ -27,6 +27,7 @@ class VfsFilesystem;
 namespace Kernel {
 class GlobalScheduler;
 class KernelCore;
+class PhysicalCore;
 class Process;
 class Scheduler;
 } // namespace Kernel
@@ -90,7 +91,7 @@ class InterruptManager;
 namespace Core {

 class ARM_Interface;
-class CoreManager;
+class CpuManager;
 class DeviceMemory;
 class ExclusiveMonitor;
 class FrameLimiter;
@@ -136,16 +137,16 @@ public:
    };

    /**
-     * Run the core CPU loop
-     * This function runs the core for the specified number of CPU instructions before trying to
-     * update hardware. This is much faster than SingleStep (and should be equivalent), as the CPU
-     * is not required to do a full dispatch with each instruction. NOTE: the number of instructions
-     * requested is not guaranteed to run, as this will be interrupted preemptively if a hardware
-     * update is requested (e.g. on a thread switch).
-     * @param tight_loop If false, the CPU single-steps.
-     * @return Result status, indicating whether or not the operation succeeded.
+     * Run the OS and Application
+     * This function will start emulation and run the relevant devices
     */
-    ResultStatus RunLoop(bool tight_loop = true);
+    ResultStatus Run();
+
+    /**
+     * Pause the OS and Application
+     * This function will pause emulation and stop the relevant devices
+     */
+    ResultStatus Pause();

    /**
     * Step the CPU one instruction
@@ -209,17 +210,21 @@ public:
    /// Gets the scheduler for the CPU core that is currently running
    const Kernel::Scheduler& CurrentScheduler() const;

+    /// Gets the physical core for the CPU core that is currently running
+    Kernel::PhysicalCore& CurrentPhysicalCore();
+
+    /// Gets the physical core for the CPU core that is currently running
+    const Kernel::PhysicalCore& CurrentPhysicalCore() const;
+
    /// Gets a reference to an ARM interface for the CPU core with the specified index
    ARM_Interface& ArmInterface(std::size_t core_index);

    /// Gets a const reference to an ARM interface from the CPU core with the specified index
    const ARM_Interface& ArmInterface(std::size_t core_index) const;

-    /// Gets a CPU interface to the CPU core with the specified index
-    CoreManager& GetCoreManager(std::size_t core_index);
+    CpuManager& GetCpuManager();

-    /// Gets a CPU interface to the CPU core with the specified index
-    const CoreManager& GetCoreManager(std::size_t core_index) const;
+    const CpuManager& GetCpuManager() const;

    /// Gets a reference to the exclusive monitor
    ExclusiveMonitor& Monitor();
@@ -370,15 +375,18 @@ public:
    /// Register a host thread as an auxiliary thread.
    void RegisterHostThread();

+    /// Enter Dynarmic Microprofile
+    void EnterDynarmicProfile();
+
+    /// Exit Dynarmic Microprofile
+    void ExitDynarmicProfile();
+
+    /// Tells whether the system is running in multicore mode.
+    bool IsMulticore() const;
+
 private:
    System();

-    /// Returns the currently running CPU core
-    CoreManager& CurrentCoreManager();
-
-    /// Returns the currently running CPU core
-    const CoreManager& CurrentCoreManager() const;
-
    /**
     * Initialize the emulated system.
     * @param emu_window Reference to the host-system window used for video output and keyboard
--- a/src/core/core_manager.cpp
+++ b/src/core/core_manager.cpp
@@ -1,67 +0,0 @@
-// Copyright 2018 yuzu emulator team
-// Licensed under GPLv2 or any later version
-// Refer to the license.txt file included.
-
-#include <condition_variable>
-#include <mutex>
-
-#include "common/logging/log.h"
-#include "core/arm/exclusive_monitor.h"
-#include "core/arm/unicorn/arm_unicorn.h"
-#include "core/core.h"
-#include "core/core_manager.h"
-#include "core/core_timing.h"
-#include "core/hle/kernel/kernel.h"
-#include "core/hle/kernel/physical_core.h"
-#include "core/hle/kernel/scheduler.h"
-#include "core/hle/kernel/thread.h"
-#include "core/hle/lock.h"
-#include "core/settings.h"
-
-namespace Core {
-
-CoreManager::CoreManager(System& system, std::size_t core_index)
-    : global_scheduler{system.GlobalScheduler()}, physical_core{system.Kernel().PhysicalCore(
-                                                      core_index)},
-      core_timing{system.CoreTiming()}, core_index{core_index} {}
-
-CoreManager::~CoreManager() = default;
-
-void CoreManager::RunLoop(bool tight_loop) {
-    Reschedule();
-
-    // If we don't have a currently active thread then don't execute instructions,
-    // instead advance to the next event and try to yield to the next thread
-    if (Kernel::GetCurrentThread() == nullptr) {
-        LOG_TRACE(Core, "Core-{} idling", core_index);
-        core_timing.Idle();
-    } else {
-        if (tight_loop) {
-            physical_core.Run();
-        } else {
-            physical_core.Step();
-        }
-    }
-    core_timing.Advance();
-
-    Reschedule();
-}
-
-void CoreManager::SingleStep() {
-    return RunLoop(false);
-}
-
-void CoreManager::PrepareReschedule() {
-    physical_core.Stop();
-}
-
-void CoreManager::Reschedule() {
-    // Lock the global kernel mutex when we manipulate the HLE state
-    std::lock_guard lock(HLE::g_hle_lock);
-
-    global_scheduler.SelectThread(core_index);
-
-    physical_core.Scheduler().TryDoContextSwitch();
-}
-
-} // namespace Core
--- a/src/core/core_manager.h
+++ b/src/core/core_manager.h
@@ -1,63 +0,0 @@
-// Copyright 2018 yuzu emulator team
-// Licensed under GPLv2 or any later version
-// Refer to the license.txt file included.
-
-#pragma once
-
-#include <atomic>
-#include <cstddef>
-#include <memory>
-#include "common/common_types.h"
-
-namespace Kernel {
-class GlobalScheduler;
-class PhysicalCore;
-} // namespace Kernel
-
-namespace Core {
-class System;
-}
-
-namespace Core::Timing {
-class CoreTiming;
-}
-
-namespace Core::Memory {
-class Memory;
-}
-
-namespace Core {
-
-constexpr unsigned NUM_CPU_CORES{4};
-
-class CoreManager {
-public:
-    CoreManager(System& system, std::size_t core_index);
-    ~CoreManager();
-
-    void RunLoop(bool tight_loop = true);
-
-    void SingleStep();
-
-    void PrepareReschedule();
-
-    bool IsMainCore() const {
-        return core_index == 0;
-    }
-
-    std::size_t CoreIndex() const {
-        return core_index;
-    }
-
-private:
-    void Reschedule();
-
-    Kernel::GlobalScheduler& global_scheduler;
-    Kernel::PhysicalCore& physical_core;
-    Timing::CoreTiming& core_timing;
-
-    std::atomic<bool> reschedule_pending = false;
-    std::size_t core_index;
-};
-
-} // namespace Core
--- a/src/core/core_timing.cpp
+++ b/src/core/core_timing.cpp
@@ -1,29 +1,27 @@
-// Copyright 2008 Dolphin Emulator Project / 2017 Citra Emulator Project
-// Licensed under GPLv2+
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.

-#include "core/core_timing.h"
-
 #include <algorithm>
 #include <mutex>
 #include <string>
 #include <tuple>

 #include "common/assert.h"
-#include "common/thread.h"
+#include "common/microprofile.h"
+#include "core/core_timing.h"
 #include "core/core_timing_util.h"
-#include "core/hardware_properties.h"

 namespace Core::Timing {

-constexpr int MAX_SLICE_LENGTH = 10000;
+constexpr u64 MAX_SLICE_LENGTH = 4000;

 std::shared_ptr<EventType> CreateEvent(std::string name, TimedCallback&& callback) {
    return std::make_shared<EventType>(std::move(callback), std::move(name));
 }

 struct CoreTiming::Event {
-    s64 time;
+    u64 time;
    u64 fifo_order;
    u64 userdata;
    std::weak_ptr<EventType> type;
@@ -39,51 +37,90 @@ struct CoreTiming::Event {
    }
 };

-CoreTiming::CoreTiming() = default;
+CoreTiming::CoreTiming() {
+    clock =
+        Common::CreateBestMatchingClock(Core::Hardware::BASE_CLOCK_RATE, Core::Hardware::CNTFREQ);
+}
+
 CoreTiming::~CoreTiming() = default;

-void CoreTiming::Initialize() {
-    downcounts.fill(MAX_SLICE_LENGTH);
-    time_slice.fill(MAX_SLICE_LENGTH);
-    slice_length = MAX_SLICE_LENGTH;
-    global_timer = 0;
-    idled_cycles = 0;
-    current_context = 0;
-
-    // The time between CoreTiming being initialized and the first call to Advance() is considered
-    // the slice boundary between slice -1 and slice 0. Dispatcher loops must call Advance() before
-    // executing the first cycle of each slice to prepare the slice length and downcount for
-    // that slice.
-    is_global_timer_sane = true;
+void CoreTiming::ThreadEntry(CoreTiming& instance) { // std::thread entry used by Initialize()
+    constexpr char name[] = "yuzu:HostTiming";
+    MicroProfileOnThreadCreate(name);
+    Common::SetCurrentThreadName(name);
+    Common::SetCurrentThreadPriority(Common::ThreadPriority::VeryHigh);
+    instance.on_thread_init(); // callback stored by Initialize(); NOTE(review): confirm non-empty
+    instance.ThreadLoop(); // returns once shutting_down is set
+}

+void CoreTiming::Initialize(std::function<void(void)>&& on_thread_init_) {
+    on_thread_init = std::move(on_thread_init_);
    event_fifo_id = 0;
-
+    shutting_down = false;
+    ticks = 0;
    const auto empty_timed_callback = [](u64, s64) {};
    ev_lost = CreateEvent("_lost_event", empty_timed_callback);
+    if (is_multicore) {
+        timer_thread = std::make_unique<std::thread>(ThreadEntry, std::ref(*this));
+    }
 }

 void CoreTiming::Shutdown() {
+    paused = true;
+    shutting_down = true;
+    pause_event.Set();
+    event.Set();
+    if (timer_thread) {
+        timer_thread->join();
+    }
    ClearPendingEvents();
+    timer_thread.reset();
+    has_started = false;
 }

-void CoreTiming::ScheduleEvent(s64 cycles_into_future, const std::shared_ptr<EventType>& event_type,
-                               u64 userdata) {
-    std::lock_guard guard{inner_mutex};
-    const s64 timeout = GetTicks() + cycles_into_future;
+void CoreTiming::Pause(bool is_paused) { // non-blocking; SyncPause() is the waiting variant
+    paused = is_paused;
+    pause_event.Set(); // releases ThreadLoop() if it is blocked in pause_event.Wait()
+}

-    // If this event needs to be scheduled before the next advance(), force one early
-    if (!is_global_timer_sane) {
-        ForceExceptionCheck(cycles_into_future);
+void CoreTiming::SyncPause(bool is_paused) { // Pause(), then wait for the timer thread's ack
+    if (is_paused == paused && paused_set == paused) { // already requested and acknowledged
+        return;
    }
+    Pause(is_paused);
+    if (timer_thread) { // no thread exists in single-core mode, so there is nothing to wait for
+        if (!is_paused) {
+            pause_event.Set();
+        }
+        event.Set(); // wake ThreadLoop() if it is sleeping on the event
+        while (paused_set != is_paused) // paused_set is only written by ThreadLoop()
+            std::this_thread::yield(); // let other threads run instead of hot-spinning
+    }
+}

-    event_queue.emplace_back(Event{timeout, event_fifo_id++, userdata, event_type});
+bool CoreTiming::IsRunning() const {
+    return !paused_set; // paused_set is written by ThreadLoop(), not directly by Pause()
+}

-    std::push_heap(event_queue.begin(), event_queue.end(), std::greater<>());
+bool CoreTiming::HasPendingEvents() const {
+    return !wait_set || !event_queue.empty(); // false only while waiting on an empty queue
+}
+
+void CoreTiming::ScheduleEvent(s64 ns_into_future, const std::shared_ptr<EventType>& event_type,
+                               u64 userdata) {
+    {
+        std::scoped_lock scope{basic_lock}; // held only while the queue is modified
+        const u64 timeout = static_cast<u64>(GetGlobalTimeNs().count() + ns_into_future);
+
+        event_queue.emplace_back(Event{timeout, event_fifo_id++, userdata, event_type});
+
+        std::push_heap(event_queue.begin(), event_queue.end(), std::greater<>());
+    }
+    event.Set(); // after unlocking: wakes ThreadLoop() if it is waiting on event
 }

 void CoreTiming::UnscheduleEvent(const std::shared_ptr<EventType>& event_type, u64 userdata) {
-    std::lock_guard guard{inner_mutex};
-
+    std::scoped_lock scope{basic_lock};
    const auto itr = std::remove_if(event_queue.begin(), event_queue.end(), [&](const Event& e) {
        return e.type.lock().get() == event_type.get() && e.userdata == userdata;
    });
@@ -95,21 +132,39 @@ void CoreTiming::UnscheduleEvent(const std::shared_ptr<EventType>& event_type, u
    }
 }

-u64 CoreTiming::GetTicks() const {
-    u64 ticks = static_cast<u64>(global_timer);
-    if (!is_global_timer_sane) {
-        ticks += accumulated_ticks;
+void CoreTiming::AddTicks(u64 ticks) {
+    this->ticks += ticks; // the parameter shadows the member, hence this->
+    downcount -= ticks;
+}
+
+void CoreTiming::Idle() {
+    // Nothing queued: just nudge the cycle counter forward.
+    if (event_queue.empty()) {
+        ticks += 1000U;
+        return;
+    }
+    // Otherwise jump (never backwards) to slightly past the event at the front of the heap.
+    const u64 target_ns = event_queue.front().time;
+    const u64 target_ticks = nsToCycles(std::chrono::nanoseconds(target_ns)) + 10U;
+    ticks = std::max(ticks, target_ticks);
+}
+
+void CoreTiming::ResetTicks() {
+    downcount = MAX_SLICE_LENGTH; // 4000, defined at the top of this file
+}
+
+u64 CoreTiming::GetCPUTicks() const {
+    if (is_multicore) {
+        return clock->GetCPUCycles(); // multicore: read from the WallClock instance
    }
    return ticks;
 }

-u64 CoreTiming::GetIdleTicks() const {
-    return static_cast<u64>(idled_cycles);
-}
-
-void CoreTiming::AddTicks(u64 ticks) {
-    accumulated_ticks += ticks;
-    downcounts[current_context] -= static_cast<s64>(ticks);
+u64 CoreTiming::GetClockTicks() const {
+    if (is_multicore) {
+        return clock->GetClockCycles(); // multicore: read from the WallClock instance
+    }
+    return CpuCyclesToClockCycles(ticks); // single core: convert the emulated cycle counter
 }

 void CoreTiming::ClearPendingEvents() {
@@ -117,7 +172,7 @@ void CoreTiming::ClearPendingEvents() {
 }

 void CoreTiming::RemoveEvent(const std::shared_ptr<EventType>& event_type) {
-    std::lock_guard guard{inner_mutex};
+    std::scoped_lock lock{basic_lock};

    const auto itr = std::remove_if(event_queue.begin(), event_queue.end(), [&](const Event& e) {
        return e.type.lock().get() == event_type.get();
@@ -130,97 +185,68 @@ void CoreTiming::RemoveEvent(const std::shared_ptr<EventType>& event_type) {
    }
 }

-void CoreTiming::ForceExceptionCheck(s64 cycles) {
-    cycles = std::max<s64>(0, cycles);
-    if (downcounts[current_context] <= cycles) {
-        return;
-    }
-
-    // downcount is always (much) smaller than MAX_INT so we can safely cast cycles to an int
-    // here. Account for cycles already executed by adjusting the g.slice_length
-    downcounts[current_context] = static_cast<int>(cycles);
-}
-
-std::optional<u64> CoreTiming::NextAvailableCore(const s64 needed_ticks) const {
-    const u64 original_context = current_context;
-    u64 next_context = (original_context + 1) % num_cpu_cores;
-    while (next_context != original_context) {
-        if (time_slice[next_context] >= needed_ticks) {
-            return {next_context};
-        } else if (time_slice[next_context] >= 0) {
-            return std::nullopt;
-        }
-        next_context = (next_context + 1) % num_cpu_cores;
-    }
-    return std::nullopt;
-}
-
-void CoreTiming::Advance() {
-    std::unique_lock<std::mutex> guard(inner_mutex);
-
-    const u64 cycles_executed = accumulated_ticks;
-    time_slice[current_context] = std::max<s64>(0, time_slice[current_context] - accumulated_ticks);
-    global_timer += cycles_executed;
-
-    is_global_timer_sane = true;
+std::optional<s64> CoreTiming::Advance() {
+    std::scoped_lock lock{advance_lock, basic_lock};
+    global_timer = GetGlobalTimeNs().count();

    while (!event_queue.empty() && event_queue.front().time <= global_timer) {
        Event evt = std::move(event_queue.front());
        std::pop_heap(event_queue.begin(), event_queue.end(), std::greater<>());
        event_queue.pop_back();
-        inner_mutex.unlock();
+        basic_lock.unlock();

        if (auto event_type{evt.type.lock()}) {
            event_type->callback(evt.userdata, global_timer - evt.time);
        }

-        inner_mutex.lock();
+        basic_lock.lock();
+        global_timer = GetGlobalTimeNs().count();
    }

-    is_global_timer_sane = false;
-
-    // Still events left (scheduled in the future)
    if (!event_queue.empty()) {
-        const s64 needed_ticks =
-            std::min<s64>(event_queue.front().time - global_timer, MAX_SLICE_LENGTH);
-        const auto next_core = NextAvailableCore(needed_ticks);
-        if (next_core) {
-            downcounts[*next_core] = needed_ticks;
+        const s64 next_time = event_queue.front().time - global_timer;
+        return next_time;
+    } else {
+        return std::nullopt;
+    }
+}
+
+void CoreTiming::ThreadLoop() {
+    has_started = true;
+    while (!shutting_down) {
+        while (!paused) {
+            paused_set = false; // tells SyncPause()/IsRunning() the loop is active
+            const auto next_time = Advance();
+            if (next_time) {
+                if (*next_time > 0) {
+                    std::chrono::nanoseconds next_time_ns = std::chrono::nanoseconds(*next_time);
+                    event.WaitFor(next_time_ns); // sleep until the next event is due or event is set
+                }
+            } else {
+                wait_set = true; // queue empty: block until event is set (e.g. by ScheduleEvent)
+                event.Wait();
+            }
+            wait_set = false;
        }
+        paused_set = true; // acknowledges the pause to SyncPause()
+        clock->Pause(true);
+        pause_event.Wait();
+        clock->Pause(false);
    }
-
-    accumulated_ticks = 0;
-
-    downcounts[current_context] = time_slice[current_context];
 }

-void CoreTiming::ResetRun() {
-    downcounts.fill(MAX_SLICE_LENGTH);
-    time_slice.fill(MAX_SLICE_LENGTH);
-    current_context = 0;
-    // Still events left (scheduled in the future)
-    if (!event_queue.empty()) {
-        const s64 needed_ticks =
-            std::min<s64>(event_queue.front().time - global_timer, MAX_SLICE_LENGTH);
-        downcounts[current_context] = needed_ticks;
+std::chrono::nanoseconds CoreTiming::GetGlobalTimeNs() const {
+    if (is_multicore) {
+        return clock->GetTimeNS();
    }
-
-    is_global_timer_sane = false;
-    accumulated_ticks = 0;
-}
-
-void CoreTiming::Idle() {
-    accumulated_ticks += downcounts[current_context];
-    idled_cycles += downcounts[current_context];
-    downcounts[current_context] = 0;
+    return CyclesToNs(ticks);
 }

 std::chrono::microseconds CoreTiming::GetGlobalTimeUs() const {
-    return std::chrono::microseconds{GetTicks() * 1000000 / Hardware::BASE_CLOCK_RATE};
-}
-
-s64 CoreTiming::GetDowncount() const {
-    return downcounts[current_context];
+    if (is_multicore) {
+        return clock->GetTimeUS();
+    }
+    return CyclesToUs(ticks);
 }

 } // namespace Core::Timing
--- a/src/core/core_timing.h
+++ b/src/core/core_timing.h
@@ -1,19 +1,25 @@
-// Copyright 2008 Dolphin Emulator Project / 2017 Citra Emulator Project
-// Licensed under GPLv2+
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.

 #pragma once

+#include <atomic>
 #include <chrono>
 #include <functional>
 #include <memory>
 #include <mutex>
 #include <optional>
 #include <string>
+#include <thread>
 #include <vector>

 #include "common/common_types.h"
+#include "common/spin_lock.h"
+#include "common/thread.h"
 #include "common/threadsafe_queue.h"
+#include "common/wall_clock.h"
+#include "core/hardware_properties.h"

 namespace Core::Timing {

@@ -56,16 +62,40 @@ public:

    /// CoreTiming begins at the boundary of timing slice -1. An initial call to Advance() is
    /// required to end slice - 1 and start slice 0 before the first cycle of code is executed.
-    void Initialize();
+    void Initialize(std::function<void(void)>&& on_thread_init_);

    /// Tears down all timing related functionality.
    void Shutdown();

-    /// After the first Advance, the slice lengths and the downcount will be reduced whenever an
-    /// event is scheduled earlier than the current values.
-    ///
-    /// Scheduling from a callback will not update the downcount until the Advance() completes.
-    void ScheduleEvent(s64 cycles_into_future, const std::shared_ptr<EventType>& event_type,
+    /// Sets if emulation is multicore or single core, must be set before Initialize
+    void SetMulticore(bool is_multicore) {
+        this->is_multicore = is_multicore;
+    }
+
+    /// Check if it's using host timing.
+    bool IsHostTiming() const {
+        return is_multicore;
+    }
+
+    /// Pauses/Unpauses the execution of the timer thread.
+    void Pause(bool is_paused);
+
+    /// Pauses/Unpauses the execution of the timer thread and waits until paused.
+    void SyncPause(bool is_paused);
+
+    /// Checks if core timing is running.
+    bool IsRunning() const;
+
+    /// Checks if the timer thread has started.
+    bool HasStarted() const {
+        return has_started;
+    }
+
+    /// Checks if there are any pending time events.
+    bool HasPendingEvents() const;
+
+    /// Schedules an event in core timing
+    void ScheduleEvent(s64 ns_into_future, const std::shared_ptr<EventType>& event_type,
                       u64 userdata = 0);

    void UnscheduleEvent(const std::shared_ptr<EventType>& event_type, u64 userdata);
@@ -73,41 +103,30 @@ public:
    /// We only permit one event of each type in the queue at a time.
    void RemoveEvent(const std::shared_ptr<EventType>& event_type);

-    void ForceExceptionCheck(s64 cycles);
-
-    /// This should only be called from the emu thread, if you are calling it any other thread,
-    /// you are doing something evil
-    u64 GetTicks() const;
-
-    u64 GetIdleTicks() const;
-
    void AddTicks(u64 ticks);

-    /// Advance must be called at the beginning of dispatcher loops, not the end. Advance() ends
-    /// the previous timing slice and begins the next one, you must Advance from the previous
-    /// slice to the current one before executing any cycles. CoreTiming starts in slice -1 so an
-    /// Advance() is required to initialize the slice length before the first cycle of emulated
-    /// instructions is executed.
-    void Advance();
+    void ResetTicks();

-    /// Pretend that the main CPU has executed enough cycles to reach the next event.
    void Idle();

+    s64 GetDowncount() const {
+        return downcount;
+    }
+
+    /// Returns current time in emulated CPU cycles
+    u64 GetCPUTicks() const;
+
+    /// Returns current time in emulated clock cycles
+    u64 GetClockTicks() const;
+
+    /// Returns current time in microseconds.
    std::chrono::microseconds GetGlobalTimeUs() const;

-    void ResetRun();
+    /// Returns current time in nanoseconds.
+    std::chrono::nanoseconds GetGlobalTimeNs() const;

-    s64 GetDowncount() const;
-
-    void SwitchContext(u64 new_context) {
-        current_context = new_context;
-    }
-
-    bool CanCurrentContextRun() const {
-        return time_slice[current_context] > 0;
-    }
-
-    std::optional<u64> NextAvailableCore(const s64 needed_ticks) const;
+    /// Checks for events manually and returns time in nanoseconds for next event, threadsafe.
+    std::optional<s64> Advance();

 private:
    struct Event;
@@ -115,21 +134,14 @@ private:
    /// Clear all pending events. This should ONLY be done on exit.
    void ClearPendingEvents();

-    static constexpr u64 num_cpu_cores = 4;
+    static void ThreadEntry(CoreTiming& instance);
+    void ThreadLoop();

-    s64 global_timer = 0;
-    s64 idled_cycles = 0;
-    s64 slice_length = 0;
-    u64 accumulated_ticks = 0;
-    std::array<s64, num_cpu_cores> downcounts{};
-    // Slice of time assigned to each core per run.
-    std::array<s64, num_cpu_cores> time_slice{};
-    u64 current_context = 0;
+    std::unique_ptr<Common::WallClock> clock;

-    // Are we in a function that has been called from Advance()
-    // If events are scheduled from a function that gets called from Advance(),
-    // don't change slice_length and downcount.
-    bool is_global_timer_sane = false;
+    u64 global_timer = 0;
+
+    std::chrono::nanoseconds start_point;

    // The queue is a min-heap using std::make_heap/push_heap/pop_heap.
    // We don't use std::priority_queue because we need to be able to serialize, unserialize and
@@ -139,8 +151,23 @@ private:
    u64 event_fifo_id = 0;

    std::shared_ptr<EventType> ev_lost;
+    Common::Event event{};
+    Common::Event pause_event{};
+    Common::SpinLock basic_lock{};
+    Common::SpinLock advance_lock{};
+    std::unique_ptr<std::thread> timer_thread;
+    std::atomic<bool> paused{};
+    std::atomic<bool> paused_set{};
+    std::atomic<bool> wait_set{};
+    std::atomic<bool> shutting_down{};
+    std::atomic<bool> has_started{};
+    std::function<void(void)> on_thread_init{};

-    std::mutex inner_mutex;
+    bool is_multicore{};
+
+    /// Cycle timing
+    u64 ticks{};
+    s64 downcount{};
 };

 /// Creates a core timing event with the given name and callback.
--- a/src/core/core_timing_util.cpp
+++ b/src/core/core_timing_util.cpp
@@ -38,15 +38,23 @@ s64 usToCycles(std::chrono::microseconds us) {
 }

 s64 nsToCycles(std::chrono::nanoseconds ns) {
-    if (static_cast<u64>(ns.count() / 1000000000) > MAX_VALUE_TO_MULTIPLY) {
-        LOG_ERROR(Core_Timing, "Integer overflow, use max value");
-        return std::numeric_limits<s64>::max();
-    }
-    if (static_cast<u64>(ns.count()) > MAX_VALUE_TO_MULTIPLY) {
-        LOG_DEBUG(Core_Timing, "Time very big, do rounding");
-        return Hardware::BASE_CLOCK_RATE * (ns.count() / 1000000000);
-    }
-    return (Hardware::BASE_CLOCK_RATE * ns.count()) / 1000000000;
+    const u128 temporal = Common::Multiply64Into128(ns.count(), Hardware::BASE_CLOCK_RATE);
+    return Common::Divide128On32(temporal, static_cast<u32>(1000000000)).first;
+}
+
+u64 msToClockCycles(std::chrono::milliseconds ms) { // parameter renamed: it holds milliseconds
+    const u128 temp = Common::Multiply64Into128(ms.count(), Hardware::CNTFREQ);
+    return Common::Divide128On32(temp, 1000).first; // ms * CNTFREQ / 1000
+}
+
+u64 usToClockCycles(std::chrono::microseconds us) { // parameter renamed: it holds microseconds
+    const u128 temp = Common::Multiply64Into128(us.count(), Hardware::CNTFREQ);
+    return Common::Divide128On32(temp, 1000000).first; // us * CNTFREQ / 1'000'000
+}
+
+u64 nsToClockCycles(std::chrono::nanoseconds ns) {
+    const u128 scaled = Common::Multiply64Into128(ns.count(), Hardware::CNTFREQ);
+    return Common::Divide128On32(scaled, 1000000000).first; // ns * CNTFREQ / 1'000'000'000
 }

 u64 CpuCyclesToClockCycles(u64 ticks) {
@@ -54,4 +62,22 @@ u64 CpuCyclesToClockCycles(u64 ticks) {
    return Common::Divide128On32(temporal, static_cast<u32>(Hardware::BASE_CLOCK_RATE)).first;
 }

+std::chrono::milliseconds CyclesToMs(s64 cycles) { // cycles * 1000 / BASE_CLOCK_RATE
+    const u128 wide = Common::Multiply64Into128(cycles, 1000);
+    const u64 ms = Common::Divide128On32(wide, static_cast<u32>(Hardware::BASE_CLOCK_RATE)).first;
+    return std::chrono::milliseconds(ms);
+}
+
+std::chrono::nanoseconds CyclesToNs(s64 cycles) { // cycles * 1'000'000'000 / BASE_CLOCK_RATE
+    const u128 wide = Common::Multiply64Into128(cycles, 1000000000);
+    const u64 ns = Common::Divide128On32(wide, static_cast<u32>(Hardware::BASE_CLOCK_RATE)).first;
+    return std::chrono::nanoseconds(ns);
+}
+
+std::chrono::microseconds CyclesToUs(s64 cycles) { // cycles * 1'000'000 / BASE_CLOCK_RATE
+    const u128 wide = Common::Multiply64Into128(cycles, 1000000);
+    const u64 us = Common::Divide128On32(wide, static_cast<u32>(Hardware::BASE_CLOCK_RATE)).first;
+    return std::chrono::microseconds(us);
+}
+
 } // namespace Core::Timing
--- a/src/core/core_timing_util.h
+++ b/src/core/core_timing_util.h
@@ -13,18 +13,12 @@ namespace Core::Timing {
 s64 msToCycles(std::chrono::milliseconds ms);
 s64 usToCycles(std::chrono::microseconds us);
 s64 nsToCycles(std::chrono::nanoseconds ns);
-
-inline std::chrono::milliseconds CyclesToMs(s64 cycles) {
-    return std::chrono::milliseconds(cycles * 1000 / Hardware::BASE_CLOCK_RATE);
-}
-
-inline std::chrono::nanoseconds CyclesToNs(s64 cycles) {
-    return std::chrono::nanoseconds(cycles * 1000000000 / Hardware::BASE_CLOCK_RATE);
-}
-
-inline std::chrono::microseconds CyclesToUs(s64 cycles) {
-    return std::chrono::microseconds(cycles * 1000000 / Hardware::BASE_CLOCK_RATE);
-}
+u64 msToClockCycles(std::chrono::milliseconds ms); // milliseconds -> CNTFREQ clock cycles
+u64 usToClockCycles(std::chrono::microseconds us); // microseconds -> CNTFREQ clock cycles
+u64 nsToClockCycles(std::chrono::nanoseconds ns);  // nanoseconds -> CNTFREQ clock cycles
+std::chrono::milliseconds CyclesToMs(s64 cycles);
+std::chrono::nanoseconds CyclesToNs(s64 cycles);
+std::chrono::microseconds CyclesToUs(s64 cycles);

 u64 CpuCyclesToClockCycles(u64 ticks);

--- a/src/core/cpu_manager.cpp
+++ b/src/core/cpu_manager.cpp
@@ -2,80 +2,372 @@
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.

+#include "common/fiber.h"
+#include "common/microprofile.h"
+#include "common/thread.h"
 #include "core/arm/exclusive_monitor.h"
 #include "core/core.h"
-#include "core/core_manager.h"
 #include "core/core_timing.h"
 #include "core/cpu_manager.h"
 #include "core/gdbstub/gdbstub.h"
+#include "core/hle/kernel/kernel.h"
+#include "core/hle/kernel/physical_core.h"
+#include "core/hle/kernel/scheduler.h"
+#include "core/hle/kernel/thread.h"
+#include "video_core/gpu.h"

 namespace Core {

 CpuManager::CpuManager(System& system) : system{system} {}
 CpuManager::~CpuManager() = default;

+void CpuManager::ThreadStart(CpuManager& cpu_manager, std::size_t core) {
+    cpu_manager.RunThread(core);
+}
+
 void CpuManager::Initialize() {
-    for (std::size_t index = 0; index < core_managers.size(); ++index) {
-        core_managers[index] = std::make_unique<CoreManager>(system, index);
+    running_mode = true;
+    if (is_multicore) {
+        for (std::size_t core = 0; core < Core::Hardware::NUM_CPU_CORES; core++) {
+            core_data[core].host_thread =
+                std::make_unique<std::thread>(ThreadStart, std::ref(*this), core);
+        }
+    } else {
+        core_data[0].host_thread = std::make_unique<std::thread>(ThreadStart, std::ref(*this), 0);
    }
 }

 void CpuManager::Shutdown() {
-    for (auto& cpu_core : core_managers) {
-        cpu_core.reset();
+    running_mode = false;
+    Pause(false);
+    if (is_multicore) {
+        for (std::size_t core = 0; core < Core::Hardware::NUM_CPU_CORES; core++) {
+            core_data[core].host_thread->join();
+            core_data[core].host_thread.reset();
+        }
+    } else {
+        core_data[0].host_thread->join();
+        core_data[0].host_thread.reset();
    }
 }

-CoreManager& CpuManager::GetCoreManager(std::size_t index) {
-    return *core_managers.at(index);
+std::function<void(void*)> CpuManager::GetGuestThreadStartFunc() {
+    return std::function<void(void*)>(GuestThreadFunction);
 }

-const CoreManager& CpuManager::GetCoreManager(std::size_t index) const {
-    return *core_managers.at(index);
+std::function<void(void*)> CpuManager::GetIdleThreadStartFunc() {
+    return std::function<void(void*)>(IdleThreadFunction);
 }

-CoreManager& CpuManager::GetCurrentCoreManager() {
-    // Otherwise, use single-threaded mode active_core variable
-    return *core_managers[active_core];
+std::function<void(void*)> CpuManager::GetSuspendThreadStartFunc() {
+    return std::function<void(void*)>(SuspendThreadFunction);
 }

-const CoreManager& CpuManager::GetCurrentCoreManager() const {
-    // Otherwise, use single-threaded mode active_core variable
-    return *core_managers[active_core];
+void CpuManager::GuestThreadFunction(void* cpu_manager_) {
+    CpuManager* cpu_manager = static_cast<CpuManager*>(cpu_manager_);
+    if (cpu_manager->is_multicore) {
+        cpu_manager->MultiCoreRunGuestThread();
+    } else {
+        cpu_manager->SingleCoreRunGuestThread();
+    }
 }

-void CpuManager::RunLoop(bool tight_loop) {
-    if (GDBStub::IsServerEnabled()) {
-        GDBStub::HandlePacket();
+void CpuManager::GuestRewindFunction(void* cpu_manager_) {
+    CpuManager* cpu_manager = static_cast<CpuManager*>(cpu_manager_);
+    if (cpu_manager->is_multicore) {
+        cpu_manager->MultiCoreRunGuestLoop();
+    } else {
+        cpu_manager->SingleCoreRunGuestLoop();
+    }
+}

-        // If the loop is halted and we want to step, use a tiny (1) number of instructions to
-        // execute. Otherwise, get out of the loop function.
-        if (GDBStub::GetCpuHaltFlag()) {
-            if (GDBStub::GetCpuStepFlag()) {
-                tight_loop = false;
-            } else {
-                return;
+void CpuManager::IdleThreadFunction(void* cpu_manager_) {
+    CpuManager* cpu_manager = static_cast<CpuManager*>(cpu_manager_);
+    if (cpu_manager->is_multicore) {
+        cpu_manager->MultiCoreRunIdleThread();
+    } else {
+        cpu_manager->SingleCoreRunIdleThread();
+    }
+}
+
+void CpuManager::SuspendThreadFunction(void* cpu_manager_) {
+    CpuManager* cpu_manager = static_cast<CpuManager*>(cpu_manager_);
+    if (cpu_manager->is_multicore) {
+        cpu_manager->MultiCoreRunSuspendThread();
+    } else {
+        cpu_manager->SingleCoreRunSuspendThread();
+    }
+}
+
+void* CpuManager::GetStartFuncParamater() {
+    return static_cast<void*>(this);
+}
+
+///////////////////////////////////////////////////////////////////////////////
+///                             MultiCore                                   ///
+///////////////////////////////////////////////////////////////////////////////
+
+void CpuManager::MultiCoreRunGuestThread() {
+    auto& kernel = system.Kernel();
+    {
+        auto& sched = kernel.CurrentScheduler();
+        sched.OnThreadStart();
+    }
+    MultiCoreRunGuestLoop();
+}
+
+void CpuManager::MultiCoreRunGuestLoop() {
+    auto& kernel = system.Kernel();
+    auto* thread = kernel.CurrentScheduler().GetCurrentThread();
+    while (true) {
+        auto* physical_core = &kernel.CurrentPhysicalCore();
+        auto& arm_interface = thread->ArmInterface();
+        system.EnterDynarmicProfile();
+        while (!physical_core->IsInterrupted()) {
+            arm_interface.Run();
+            physical_core = &kernel.CurrentPhysicalCore();
+        }
+        system.ExitDynarmicProfile();
+        arm_interface.ClearExclusiveState();
+        auto& scheduler = kernel.CurrentScheduler();
+        scheduler.TryDoContextSwitch();
+    }
+}
+
+void CpuManager::MultiCoreRunIdleThread() {
+    auto& kernel = system.Kernel();
+    while (true) {
+        auto& physical_core = kernel.CurrentPhysicalCore();
+        physical_core.Idle();
+        auto& scheduler = kernel.CurrentScheduler();
+        scheduler.TryDoContextSwitch();
+    }
+}
+
+void CpuManager::MultiCoreRunSuspendThread() {
+    auto& kernel = system.Kernel();
+    {
+        auto& sched = kernel.CurrentScheduler();
+        sched.OnThreadStart();
+    }
+    while (true) {
+        auto core = kernel.GetCurrentHostThreadID();
+        auto& scheduler = kernel.CurrentScheduler();
+        Kernel::Thread* current_thread = scheduler.GetCurrentThread();
+        Common::Fiber::YieldTo(current_thread->GetHostContext(), core_data[core].host_context);
+        ASSERT(scheduler.ContextSwitchPending());
+        ASSERT(core == kernel.GetCurrentHostThreadID());
+        scheduler.TryDoContextSwitch();
+    }
+}
+
+void CpuManager::MultiCorePause(bool paused) {
+    if (!paused) {
+        bool all_not_barrier = false;
+        while (!all_not_barrier) {
+            all_not_barrier = true;
+            for (std::size_t core = 0; core < Core::Hardware::NUM_CPU_CORES; core++) {
+                all_not_barrier &=
+                    !core_data[core].is_running.load() && core_data[core].initialized.load();
            }
        }
-    }
-
-    auto& core_timing = system.CoreTiming();
-    core_timing.ResetRun();
-    bool keep_running{};
-    do {
-        keep_running = false;
-        for (active_core = 0; active_core < NUM_CPU_CORES; ++active_core) {
-            core_timing.SwitchContext(active_core);
-            if (core_timing.CanCurrentContextRun()) {
-                core_managers[active_core]->RunLoop(tight_loop);
-            }
-            keep_running |= core_timing.CanCurrentContextRun();
+        for (std::size_t core = 0; core < Core::Hardware::NUM_CPU_CORES; core++) {
+            core_data[core].enter_barrier->Set();
        }
-    } while (keep_running);
-
-    if (GDBStub::IsServerEnabled()) {
-        GDBStub::SetCpuStepFlag(false);
+        if (paused_state.load()) {
+            bool all_barrier = false;
+            while (!all_barrier) {
+                all_barrier = true;
+                for (std::size_t core = 0; core < Core::Hardware::NUM_CPU_CORES; core++) {
+                    all_barrier &=
+                        core_data[core].is_paused.load() && core_data[core].initialized.load();
+                }
+            }
+            for (std::size_t core = 0; core < Core::Hardware::NUM_CPU_CORES; core++) {
+                core_data[core].exit_barrier->Set();
+            }
+        }
+    } else {
+        /// Wait until all cores are paused.
+        bool all_barrier = false;
+        while (!all_barrier) {
+            all_barrier = true;
+            for (std::size_t core = 0; core < Core::Hardware::NUM_CPU_CORES; core++) {
+                all_barrier &=
+                    core_data[core].is_paused.load() && core_data[core].initialized.load();
+            }
+        }
+        /// Don't release the barrier
    }
+    paused_state = paused;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+///                             SingleCore                                  ///
+///////////////////////////////////////////////////////////////////////////////
+
+void CpuManager::SingleCoreRunGuestThread() {
+    auto& kernel = system.Kernel();
+    {
+        auto& sched = kernel.CurrentScheduler();
+        sched.OnThreadStart();
+    }
+    SingleCoreRunGuestLoop();
+}
+
+void CpuManager::SingleCoreRunGuestLoop() {
+    auto& kernel = system.Kernel();
+    auto* thread = kernel.CurrentScheduler().GetCurrentThread();
+    while (true) {
+        auto* physical_core = &kernel.CurrentPhysicalCore();
+        auto& arm_interface = thread->ArmInterface();
+        system.EnterDynarmicProfile();
+        if (!physical_core->IsInterrupted()) {
+            arm_interface.Run();
+            physical_core = &kernel.CurrentPhysicalCore();
+        }
+        system.ExitDynarmicProfile();
+        thread->SetPhantomMode(true);
+        system.CoreTiming().Advance();
+        thread->SetPhantomMode(false);
+        arm_interface.ClearExclusiveState();
+        PreemptSingleCore();
+        auto& scheduler = kernel.Scheduler(current_core);
+        scheduler.TryDoContextSwitch();
+    }
+}
+
+void CpuManager::SingleCoreRunIdleThread() {
+    auto& kernel = system.Kernel();
+    while (true) {
+        auto& physical_core = kernel.CurrentPhysicalCore();
+        PreemptSingleCore(false);
+        system.CoreTiming().AddTicks(1000U);
+        idle_count++;
+        auto& scheduler = physical_core.Scheduler();
+        scheduler.TryDoContextSwitch();
+    }
+}
+
+void CpuManager::SingleCoreRunSuspendThread() {
+    auto& kernel = system.Kernel();
+    {
+        auto& sched = kernel.CurrentScheduler();
+        sched.OnThreadStart();
+    }
+    while (true) {
+        auto core = kernel.GetCurrentHostThreadID();
+        auto& scheduler = kernel.CurrentScheduler();
+        Kernel::Thread* current_thread = scheduler.GetCurrentThread();
+        Common::Fiber::YieldTo(current_thread->GetHostContext(), core_data[0].host_context);
+        ASSERT(scheduler.ContextSwitchPending());
+        ASSERT(core == kernel.GetCurrentHostThreadID());
+        scheduler.TryDoContextSwitch();
+    }
+}
+
+void CpuManager::PreemptSingleCore(bool from_running_enviroment) {
+    const std::size_t old_core = current_core;
+    auto& scheduler = system.Kernel().Scheduler(old_core);
+    Kernel::Thread* current_thread = scheduler.GetCurrentThread();
+    if (idle_count >= 4 || from_running_enviroment) {
+        if (!from_running_enviroment) {
+            system.CoreTiming().Idle();
+            idle_count = 0;
+        }
+        current_thread->SetPhantomMode(true);
+        system.CoreTiming().Advance();
+        current_thread->SetPhantomMode(false);
+    }
+    current_core.store((current_core + 1) % Core::Hardware::NUM_CPU_CORES);
+    system.CoreTiming().ResetTicks();
+    scheduler.Unload();
+    auto& next_scheduler = system.Kernel().Scheduler(current_core);
+    Common::Fiber::YieldTo(current_thread->GetHostContext(), next_scheduler.ControlContext());
+    /// current_core may have changed by the time this fiber resumes; re-fetch the scheduler
+    auto& current_scheduler = system.Kernel().Scheduler(current_core);
+    current_scheduler.Reload();
+    auto* resumed_thread = current_scheduler.GetCurrentThread();
+    if (!resumed_thread->IsIdleThread()) {
+        idle_count = 0;
+    }
+}
+
+void CpuManager::SingleCorePause(bool paused) {
+    if (!paused) {
+        bool all_not_barrier = false;
+        while (!all_not_barrier) {
+            all_not_barrier = !core_data[0].is_running.load() && core_data[0].initialized.load();
+        }
+        core_data[0].enter_barrier->Set();
+        if (paused_state.load()) {
+            bool all_barrier = false;
+            while (!all_barrier) {
+                all_barrier = core_data[0].is_paused.load() && core_data[0].initialized.load();
+            }
+            core_data[0].exit_barrier->Set();
+        }
+    } else {
+        /// Wait until all cores are paused.
+        bool all_barrier = false;
+        while (!all_barrier) {
+            all_barrier = core_data[0].is_paused.load() && core_data[0].initialized.load();
+        }
+        /// Don't release the barrier
+    }
+    paused_state = paused;
+}
+
+void CpuManager::Pause(bool paused) {
+    if (is_multicore) {
+        MultiCorePause(paused);
+    } else {
+        SingleCorePause(paused);
+    }
+}
+
+void CpuManager::RunThread(std::size_t core) {
+    /// Initialization
+    system.RegisterCoreThread(core);
+    std::string name;
+    if (is_multicore) {
+        name = "yuzu:CoreCPUThread_" + std::to_string(core);
+    } else {
+        name = "yuzu:CPUThread";
+    }
+    MicroProfileOnThreadCreate(name.c_str());
+    Common::SetCurrentThreadName(name.c_str());
+    Common::SetCurrentThreadPriority(Common::ThreadPriority::High);
+    auto& data = core_data[core];
+    data.enter_barrier = std::make_unique<Common::Event>();
+    data.exit_barrier = std::make_unique<Common::Event>();
+    data.host_context = Common::Fiber::ThreadToFiber();
+    data.is_running = false;
+    data.initialized = true;
+    const bool sc_sync = !is_async_gpu && !is_multicore;
+    bool sc_sync_first_use = sc_sync;
+    /// Running
+    while (running_mode) {
+        data.is_running = false;
+        data.enter_barrier->Wait();
+        if (sc_sync_first_use) {
+            system.GPU().ObtainContext();
+            sc_sync_first_use = false;
+        }
+        auto& scheduler = system.Kernel().CurrentScheduler();
+        Kernel::Thread* current_thread = scheduler.GetCurrentThread();
+        data.is_running = true;
+        Common::Fiber::YieldTo(data.host_context, current_thread->GetHostContext());
+        data.is_running = false;
+        data.is_paused = true;
+        data.exit_barrier->Wait();
+        data.is_paused = false;
+    }
+    /// Time to cleanup
+    data.host_context->Exit();
+    data.enter_barrier.reset();
+    data.exit_barrier.reset();
+    data.initialized = false;
 }

 } // namespace Core
--- a/src/core/cpu_manager.h
+++ b/src/core/cpu_manager.h
@@ -5,12 +5,19 @@
 #pragma once

 #include <array>
+#include <atomic>
+#include <functional>
 #include <memory>
+#include <thread>
 #include "core/hardware_properties.h"

+namespace Common {
+class Event;
+class Fiber;
+} // namespace Common
+
 namespace Core {

-class CoreManager;
 class System;

 class CpuManager {
@@ -24,24 +31,75 @@ public:
    CpuManager& operator=(const CpuManager&) = delete;
    CpuManager& operator=(CpuManager&&) = delete;

+    /// Sets if emulation is multicore or single core, must be set before Initialize
+    void SetMulticore(bool is_multicore) {
+        this->is_multicore = is_multicore;
+    }
+
+    /// Sets if emulation is using an asynchronous GPU.
+    void SetAsyncGpu(bool is_async_gpu) {
+        this->is_async_gpu = is_async_gpu;
+    }
+
    void Initialize();
    void Shutdown();

-    CoreManager& GetCoreManager(std::size_t index);
-    const CoreManager& GetCoreManager(std::size_t index) const;
+    void Pause(bool paused);

-    CoreManager& GetCurrentCoreManager();
-    const CoreManager& GetCurrentCoreManager() const;
+    std::function<void(void*)> GetGuestThreadStartFunc();
+    std::function<void(void*)> GetIdleThreadStartFunc();
+    std::function<void(void*)> GetSuspendThreadStartFunc();
+    void* GetStartFuncParamater();

-    std::size_t GetActiveCoreIndex() const {
-        return active_core;
+    void PreemptSingleCore(bool from_running_enviroment = true);
+
+    std::size_t CurrentCore() const {
+        return current_core.load();
    }

-    void RunLoop(bool tight_loop);
-
 private:
-    std::array<std::unique_ptr<CoreManager>, Hardware::NUM_CPU_CORES> core_managers;
-    std::size_t active_core{}; ///< Active core, only used in single thread mode
+    static void GuestThreadFunction(void* cpu_manager);
+    static void GuestRewindFunction(void* cpu_manager);
+    static void IdleThreadFunction(void* cpu_manager);
+    static void SuspendThreadFunction(void* cpu_manager);
+
+    void MultiCoreRunGuestThread();
+    void MultiCoreRunGuestLoop();
+    void MultiCoreRunIdleThread();
+    void MultiCoreRunSuspendThread();
+    void MultiCorePause(bool paused);
+
+    void SingleCoreRunGuestThread();
+    void SingleCoreRunGuestLoop();
+    void SingleCoreRunIdleThread();
+    void SingleCoreRunSuspendThread();
+    void SingleCorePause(bool paused);
+
+    static void ThreadStart(CpuManager& cpu_manager, std::size_t core);
+
+    void RunThread(std::size_t core);
+
+    struct CoreData {
+        std::shared_ptr<Common::Fiber> host_context;
+        std::unique_ptr<Common::Event> enter_barrier;
+        std::unique_ptr<Common::Event> exit_barrier;
+        std::atomic<bool> is_running;
+        std::atomic<bool> is_paused;
+        std::atomic<bool> initialized;
+        std::unique_ptr<std::thread> host_thread;
+    };
+
+    std::atomic<bool> running_mode{};
+    std::atomic<bool> paused_state{};
+
+    std::array<CoreData, Core::Hardware::NUM_CPU_CORES> core_data{};
+
+    bool is_async_gpu{};
+    bool is_multicore{};
+    std::atomic<std::size_t> current_core{};
+    std::size_t preemption_count{};
+    std::size_t idle_count{};
+    static constexpr std::size_t max_cycle_runs = 5;

    System& system;
 };
--- a/src/core/crypto/key_manager.cpp
+++ b/src/core/crypto/key_manager.cpp
@@ -695,8 +695,9 @@ void KeyManager::WriteKeyToFile(KeyCategory category, std::string_view keyname,
 }

 void KeyManager::SetKey(S128KeyType id, Key128 key, u64 field1, u64 field2) {
-    if (s128_keys.find({id, field1, field2}) != s128_keys.end())
+    if (s128_keys.find({id, field1, field2}) != s128_keys.end() || key == Key128{}) {
        return;
+    }
    if (id == S128KeyType::Titlekey) {
        Key128 rights_id;
        std::memcpy(rights_id.data(), &field2, sizeof(u64));
@@ -716,8 +717,9 @@ void KeyManager::SetKey(S128KeyType id, Key128 key, u64 field1, u64 field2) {
            return std::tie(elem.second.type, elem.second.field1, elem.second.field2) ==
                   std::tie(id, field1, field2);
        });
-    if (iter2 != s128_file_id.end())
+    if (iter2 != s128_file_id.end()) {
        WriteKeyToFile(category, iter2->first, key);
+    }

    // Variable cases
    if (id == S128KeyType::KeyArea) {
@@ -745,16 +747,18 @@ void KeyManager::SetKey(S128KeyType id, Key128 key, u64 field1, u64 field2) {
 }

 void KeyManager::SetKey(S256KeyType id, Key256 key, u64 field1, u64 field2) {
-    if (s256_keys.find({id, field1, field2}) != s256_keys.end())
+    if (s256_keys.find({id, field1, field2}) != s256_keys.end() || key == Key256{}) {
        return;
+    }
    const auto iter = std::find_if(
        s256_file_id.begin(), s256_file_id.end(),
        [&id, &field1, &field2](const std::pair<std::string, KeyIndex<S256KeyType>> elem) {
            return std::tie(elem.second.type, elem.second.field1, elem.second.field2) ==
                   std::tie(id, field1, field2);
        });
-    if (iter != s256_file_id.end())
+    if (iter != s256_file_id.end()) {
        WriteKeyToFile(KeyCategory::Standard, iter->first, key);
+    }
    s256_keys[{id, field1, field2}] = key;
 }

--- a/src/core/crypto/key_manager.h
+++ b/src/core/crypto/key_manager.h
@@ -223,7 +223,16 @@ bool operator<(const KeyIndex<KeyType>& lhs, const KeyIndex<KeyType>& rhs) {

 class KeyManager {
 public:
-    KeyManager();
+    static KeyManager& Instance() {
+        static KeyManager instance;
+        return instance;
+    }
+
+    KeyManager(const KeyManager&) = delete;
+    KeyManager& operator=(const KeyManager&) = delete;
+
+    KeyManager(KeyManager&&) = delete;
+    KeyManager& operator=(KeyManager&&) = delete;

    bool HasKey(S128KeyType id, u64 field1 = 0, u64 field2 = 0) const;
    bool HasKey(S256KeyType id, u64 field1 = 0, u64 field2 = 0) const;
@@ -257,6 +266,8 @@ public:
    bool AddTicketPersonalized(Ticket raw);

 private:
+    KeyManager();
+
    std::map<KeyIndex<S128KeyType>, Key128> s128_keys;
    std::map<KeyIndex<S256KeyType>, Key256> s256_keys;

--- a/src/core/file_sys/bis_factory.cpp
+++ b/src/core/file_sys/bis_factory.cpp
@@ -12,6 +12,10 @@

 namespace FileSys {

+constexpr u64 NAND_USER_SIZE = 0x680000000;  // 26624 MiB
+constexpr u64 NAND_SYSTEM_SIZE = 0xA0000000; // 2560 MiB
+constexpr u64 NAND_TOTAL_SIZE = 0x747C00000; // 29820 MiB
+
 BISFactory::BISFactory(VirtualDir nand_root_, VirtualDir load_root_, VirtualDir dump_root_)
    : nand_root(std::move(nand_root_)), load_root(std::move(load_root_)),
      dump_root(std::move(dump_root_)),
@@ -79,7 +83,7 @@ VirtualDir BISFactory::OpenPartition(BisPartitionId id) const {
 }

 VirtualFile BISFactory::OpenPartitionStorage(BisPartitionId id) const {
-    Core::Crypto::KeyManager keys;
+    auto& keys = Core::Crypto::KeyManager::Instance();
    Core::Crypto::PartitionDataManager pdm{
        Core::System::GetInstance().GetFilesystem()->OpenDirectory(
            FileUtil::GetUserPath(FileUtil::UserPath::SysDataDir), Mode::Read)};
@@ -110,30 +114,29 @@ VirtualDir BISFactory::GetImageDirectory() const {

 u64 BISFactory::GetSystemNANDFreeSpace() const {
    const auto sys_dir = GetOrCreateDirectoryRelative(nand_root, "/system");
-    if (sys_dir == nullptr)
-        return 0;
+    if (sys_dir == nullptr) {
+        return GetSystemNANDTotalSpace();
+    }

    return GetSystemNANDTotalSpace() - sys_dir->GetSize();
 }

 u64 BISFactory::GetSystemNANDTotalSpace() const {
-    return static_cast<u64>(Settings::values.nand_system_size);
+    return NAND_SYSTEM_SIZE;
 }

 u64 BISFactory::GetUserNANDFreeSpace() const {
-    const auto usr_dir = GetOrCreateDirectoryRelative(nand_root, "/user");
-    if (usr_dir == nullptr)
-        return 0;
-
-    return GetUserNANDTotalSpace() - usr_dir->GetSize();
+    // For some reason, games such as BioShock 1 check whether this value is exactly
+    // 0x680000000 bytes. Set the free space to be 1 MiB less than the total as a workaround.
+    return GetUserNANDTotalSpace() - 0x100000;
 }

 u64 BISFactory::GetUserNANDTotalSpace() const {
-    return static_cast<u64>(Settings::values.nand_user_size);
+    return NAND_USER_SIZE;
 }

 u64 BISFactory::GetFullNANDTotalSpace() const {
-    return static_cast<u64>(Settings::values.nand_total_size);
+    return NAND_TOTAL_SIZE;
 }

 VirtualDir BISFactory::GetBCATDirectory(u64 title_id) const {
--- a/src/core/file_sys/card_image.cpp
+++ b/src/core/file_sys/card_image.cpp
@@ -178,7 +178,7 @@ u32 XCI::GetSystemUpdateVersion() {
        return 0;

    for (const auto& file : update->GetFiles()) {
-        NCA nca{file, nullptr, 0, keys};
+        NCA nca{file, nullptr, 0};

        if (nca.GetStatus() != Loader::ResultStatus::Success)
            continue;
@@ -286,7 +286,7 @@ Loader::ResultStatus XCI::AddNCAFromPartition(XCIPartition part) {
            continue;
        }

-        auto nca = std::make_shared<NCA>(file, nullptr, 0, keys);
+        auto nca = std::make_shared<NCA>(file, nullptr, 0);
        if (nca->IsUpdate()) {
            continue;
        }
--- a/src/core/file_sys/card_image.h
+++ b/src/core/file_sys/card_image.h
@@ -140,6 +140,6 @@ private:

    u64 update_normal_partition_end;

-    Core::Crypto::KeyManager keys;
+    Core::Crypto::KeyManager& keys = Core::Crypto::KeyManager::Instance();
 };
 } // namespace FileSys
--- a/src/core/file_sys/content_archive.cpp
+++ b/src/core/file_sys/content_archive.cpp
@@ -118,9 +118,8 @@ static bool IsValidNCA(const NCAHeader& header) {
    return header.magic == Common::MakeMagic('N', 'C', 'A', '3');
 }

-NCA::NCA(VirtualFile file_, VirtualFile bktr_base_romfs_, u64 bktr_base_ivfc_offset,
-         Core::Crypto::KeyManager keys_)
-    : file(std::move(file_)), bktr_base_romfs(std::move(bktr_base_romfs_)), keys(std::move(keys_)) {
+NCA::NCA(VirtualFile file_, VirtualFile bktr_base_romfs_, u64 bktr_base_ivfc_offset)
+    : file(std::move(file_)), bktr_base_romfs(std::move(bktr_base_romfs_)) {
    if (file == nullptr) {
        status = Loader::ResultStatus::ErrorNullFile;
        return;
--- a/src/core/file_sys/content_archive.h
+++ b/src/core/file_sys/content_archive.h
@@ -99,8 +99,7 @@ inline bool IsDirectoryLogoPartition(const VirtualDir& pfs) {
 class NCA : public ReadOnlyVfsDirectory {
 public:
    explicit NCA(VirtualFile file, VirtualFile bktr_base_romfs = nullptr,
-                 u64 bktr_base_ivfc_offset = 0,
-                 Core::Crypto::KeyManager keys = Core::Crypto::KeyManager());
+                 u64 bktr_base_ivfc_offset = 0);
    ~NCA() override;

    Loader::ResultStatus GetStatus() const;
@@ -159,7 +158,7 @@ private:
    bool encrypted = false;
    bool is_update = false;

-    Core::Crypto::KeyManager keys;
+    Core::Crypto::KeyManager& keys = Core::Crypto::KeyManager::Instance();
 };

 } // namespace FileSys
--- a/src/core/file_sys/patch_manager.cpp
+++ b/src/core/file_sys/patch_manager.cpp
@@ -80,16 +80,6 @@ VirtualDir PatchManager::PatchExeFS(VirtualDir exefs) const {
    if (exefs == nullptr)
        return exefs;

-    if (Settings::values.dump_exefs) {
-        LOG_INFO(Loader, "Dumping ExeFS for title_id={:016X}", title_id);
-        const auto dump_dir =
-            Core::System::GetInstance().GetFileSystemController().GetModificationDumpRoot(title_id);
-        if (dump_dir != nullptr) {
-            const auto exefs_dir = GetOrCreateDirectoryRelative(dump_dir, "/exefs");
-            VfsRawCopyD(exefs, exefs_dir);
-        }
-    }
-
    const auto& installed = Core::System::GetInstance().GetContentProvider();

    const auto& disabled = Settings::values.disabled_addons[title_id];
@@ -135,6 +125,16 @@ VirtualDir PatchManager::PatchExeFS(VirtualDir exefs) const {
        }
    }

+    if (Settings::values.dump_exefs) {
+        LOG_INFO(Loader, "Dumping ExeFS for title_id={:016X}", title_id);
+        const auto dump_dir =
+            Core::System::GetInstance().GetFileSystemController().GetModificationDumpRoot(title_id);
+        if (dump_dir != nullptr) {
+            const auto exefs_dir = GetOrCreateDirectoryRelative(dump_dir, "/exefs");
+            VfsRawCopyD(exefs, exefs_dir);
+        }
+    }
+
    return exefs;
 }

--- a/src/core/file_sys/registered_cache.cpp
+++ b/src/core/file_sys/registered_cache.cpp
@@ -408,7 +408,7 @@ void RegisteredCache::ProcessFiles(const std::vector<NcaID>& ids) {

        if (file == nullptr)
            continue;
-        const auto nca = std::make_shared<NCA>(parser(file, id), nullptr, 0, keys);
+        const auto nca = std::make_shared<NCA>(parser(file, id), nullptr, 0);
        if (nca->GetStatus() != Loader::ResultStatus::Success ||
            nca->GetType() != NCAContentType::Meta) {
            continue;
@@ -486,7 +486,7 @@ std::unique_ptr<NCA> RegisteredCache::GetEntry(u64 title_id, ContentRecordType t
    const auto raw = GetEntryRaw(title_id, type);
    if (raw == nullptr)
        return nullptr;
-    return std::make_unique<NCA>(raw, nullptr, 0, keys);
+    return std::make_unique<NCA>(raw, nullptr, 0);
 }

 template <typename T>
@@ -865,7 +865,7 @@ std::unique_ptr<NCA> ManualContentProvider::GetEntry(u64 title_id, ContentRecord
    const auto res = GetEntryRaw(title_id, type);
    if (res == nullptr)
        return nullptr;
-    return std::make_unique<NCA>(res, nullptr, 0, keys);
+    return std::make_unique<NCA>(res, nullptr, 0);
 }

 std::vector<ContentProviderEntry> ManualContentProvider::ListEntriesFilter(
--- a/src/core/file_sys/registered_cache.h
+++ b/src/core/file_sys/registered_cache.h
@@ -88,7 +88,7 @@ public:

 protected:
    // A single instance of KeyManager to be used by GetEntry()
-    Core::Crypto::KeyManager keys;
+    Core::Crypto::KeyManager& keys = Core::Crypto::KeyManager::Instance();
 };

 class PlaceholderCache {
--- a/src/core/file_sys/sdmc_factory.cpp
+++ b/src/core/file_sys/sdmc_factory.cpp
@@ -10,6 +10,8 @@

 namespace FileSys {

+constexpr u64 SDMC_TOTAL_SIZE = 0x10000000000; // 1 TiB
+
 SDMCFactory::SDMCFactory(VirtualDir dir_)
    : dir(std::move(dir_)), contents(std::make_unique<RegisteredCache>(
                                GetOrCreateDirectoryRelative(dir, "/Nintendo/Contents/registered"),
@@ -46,7 +48,7 @@ u64 SDMCFactory::GetSDMCFreeSpace() const {
 }

 u64 SDMCFactory::GetSDMCTotalSpace() const {
-    return static_cast<u64>(Settings::values.sdmc_size);
+    return SDMC_TOTAL_SIZE;
 }

 } // namespace FileSys
--- a/src/core/file_sys/submission_package.cpp
+++ b/src/core/file_sys/submission_package.cpp
@@ -21,7 +21,7 @@
 namespace FileSys {
 namespace {
 void SetTicketKeys(const std::vector<VirtualFile>& files) {
-    Core::Crypto::KeyManager keys;
+    auto& keys = Core::Crypto::KeyManager::Instance();

    for (const auto& ticket_file : files) {
        if (ticket_file == nullptr) {
@@ -285,7 +285,7 @@ void NSP::ReadNCAs(const std::vector<VirtualFile>& files) {
                    continue;
                }

-                auto next_nca = std::make_shared<NCA>(std::move(next_file), nullptr, 0, keys);
+                auto next_nca = std::make_shared<NCA>(std::move(next_file), nullptr, 0);
                if (next_nca->GetType() == NCAContentType::Program) {
                    program_status[cnmt.GetTitleID()] = next_nca->GetStatus();
                }
--- a/src/core/file_sys/submission_package.h
+++ b/src/core/file_sys/submission_package.h
@@ -73,7 +73,7 @@ private:
    std::map<u64, std::map<std::pair<TitleType, ContentRecordType>, std::shared_ptr<NCA>>> ncas;
    std::vector<VirtualFile> ticket_files;

-    Core::Crypto::KeyManager keys;
+    Core::Crypto::KeyManager& keys = Core::Crypto::KeyManager::Instance();

    VirtualFile romfs;
    VirtualDir exefs;
--- a/src/core/file_sys/system_archive/mii_model.cpp
+++ b/src/core/file_sys/system_archive/mii_model.cpp
@@ -40,7 +40,7 @@ VirtualDir MiiModel() {
    out->AddFile(std::make_shared<ArrayVfsFile<MiiModelData::SHAPE_MID.size()>>(
        MiiModelData::SHAPE_MID, "ShapeMid.dat"));

-    return std::move(out);
+    return out;
 }

 } // namespace FileSys::SystemArchive
--- a/src/core/file_sys/system_archive/shared_font.cpp
+++ b/src/core/file_sys/system_archive/shared_font.cpp
@@ -23,7 +23,7 @@ VirtualFile PackBFTTF(const std::array<u8, Size>& data, const std::string& name)

    std::vector<u8> bfttf(Size + sizeof(u64));

-    u64 offset = 0;
+    size_t offset = 0;
    Service::NS::EncryptSharedFont(vec, bfttf, offset);
    return std::make_shared<VectorVfsFile>(std::move(bfttf), name);
 }
--- a/src/core/file_sys/vfs_real.cpp
+++ b/src/core/file_sys/vfs_real.cpp
@@ -112,19 +112,31 @@ VirtualFile RealVfsFilesystem::MoveFile(std::string_view old_path_, std::string_
    const auto new_path =
        FileUtil::SanitizePath(new_path_, FileUtil::DirectorySeparator::PlatformDefault);

-    if (!FileUtil::Exists(old_path) || FileUtil::Exists(new_path) ||
-        FileUtil::IsDirectory(old_path) || !FileUtil::Rename(old_path, new_path))
-        return nullptr;
-
    if (cache.find(old_path) != cache.end()) {
-        auto cached = cache[old_path];
-        if (!cached.expired()) {
-            auto file = cached.lock();
-            file->Open(new_path, "r+b");
-            cache.erase(old_path);
-            cache[new_path] = file;
+        // lock() yields a null pointer when the cached weak reference has expired, so every
+        // use of `file` below is guarded instead of relying on a separate expired() check.
+        auto file = cache[old_path].lock();
+
+        if (file != nullptr) {
+            file->Close();
        }
+
+        if (!FileUtil::Exists(old_path) || FileUtil::Exists(new_path) ||
+            FileUtil::IsDirectory(old_path) || !FileUtil::Rename(old_path, new_path)) {
+            return nullptr;
+        }
+
+        cache.erase(old_path);
+        if (file != nullptr) {
+            // Re-point the still-live handle at the new location and keep it cached.
+            file->Open(new_path, "r+b");
+            cache[new_path] = file;
+        }
+    } else {
+        UNREACHABLE();
+        return nullptr;
    }
+
    return OpenFile(new_path, Mode::ReadWrite);
 }

--- a/src/core/file_sys/xts_archive.h
+++ b/src/core/file_sys/xts_archive.h
@@ -62,6 +62,6 @@ private:

    VirtualFile dec_file;

-    Core::Crypto::KeyManager keys;
+    Core::Crypto::KeyManager& keys = Core::Crypto::KeyManager::Instance();
 };
 } // namespace FileSys
--- a/src/core/frontend/framebuffer_layout.cpp
+++ b/src/core/frontend/framebuffer_layout.cpp
@@ -29,7 +29,7 @@ FramebufferLayout DefaultFrameLayout(u32 width, u32 height) {

    const float window_aspect_ratio = static_cast<float>(height) / width;
    const float emulation_aspect_ratio = EmulationAspectRatio(
-        static_cast<AspectRatio>(Settings::values.aspect_ratio), window_aspect_ratio);
+        static_cast<AspectRatio>(Settings::values.aspect_ratio.GetValue()), window_aspect_ratio);

    const Common::Rectangle<u32> screen_window_area{0, 0, width, height};
    Common::Rectangle<u32> screen = MaxRectangle(screen_window_area, emulation_aspect_ratio);
--- a/src/core/gdbstub/gdbstub.cpp
+++ b/src/core/gdbstub/gdbstub.cpp
@@ -35,7 +35,6 @@
 #include "common/swap.h"
 #include "core/arm/arm_interface.h"
 #include "core/core.h"
-#include "core/core_manager.h"
 #include "core/gdbstub/gdbstub.h"
 #include "core/hle/kernel/memory/page_table.h"
 #include "core/hle/kernel/process.h"
--- a/src/core/hardware_properties.h
+++ b/src/core/hardware_properties.h
@@ -42,6 +42,10 @@ struct EmuThreadHandle {
        constexpr u32 invalid_handle = 0xFFFFFFFF;
        return {invalid_handle, invalid_handle};
    }
+
+    bool IsInvalid() const {
+        return (*this) == InvalidHandle();
+    }
 };

 } // namespace Core
--- a/src/core/hle/kernel/address_arbiter.cpp
+++ b/src/core/hle/kernel/address_arbiter.cpp
@@ -7,11 +7,15 @@

 #include "common/assert.h"
 #include "common/common_types.h"
+#include "core/arm/exclusive_monitor.h"
 #include "core/core.h"
 #include "core/hle/kernel/address_arbiter.h"
 #include "core/hle/kernel/errors.h"
+#include "core/hle/kernel/handle_table.h"
+#include "core/hle/kernel/kernel.h"
 #include "core/hle/kernel/scheduler.h"
 #include "core/hle/kernel/thread.h"
+#include "core/hle/kernel/time_manager.h"
 #include "core/hle/result.h"
 #include "core/memory.h"

@@ -20,6 +24,7 @@ namespace Kernel {
 // Wake up num_to_wake (or all) threads in a vector.
 void AddressArbiter::WakeThreads(const std::vector<std::shared_ptr<Thread>>& waiting_threads,
                                 s32 num_to_wake) {
+    auto& time_manager = system.Kernel().TimeManager();
    // Only process up to 'target' threads, unless 'target' is <= 0, in which case process
    // them all.
    std::size_t last = waiting_threads.size();
@@ -29,12 +34,10 @@ void AddressArbiter::WakeThreads(const std::vector<std::shared_ptr<Thread>>& wai

    // Signal the waiting threads.
    for (std::size_t i = 0; i < last; i++) {
-        ASSERT(waiting_threads[i]->GetStatus() == ThreadStatus::WaitArb);
-        waiting_threads[i]->SetWaitSynchronizationResult(RESULT_SUCCESS);
+        waiting_threads[i]->SetSynchronizationResults(nullptr, RESULT_SUCCESS);
        RemoveThread(waiting_threads[i]);
-        waiting_threads[i]->SetArbiterWaitAddress(0);
+        waiting_threads[i]->WaitForArbitration(false);
        waiting_threads[i]->ResumeFromWait();
-        system.PrepareReschedule(waiting_threads[i]->GetProcessorID());
    }
 }

@@ -56,6 +59,7 @@ ResultCode AddressArbiter::SignalToAddress(VAddr address, SignalType type, s32 v
 }

 ResultCode AddressArbiter::SignalToAddressOnly(VAddr address, s32 num_to_wake) {
+    SchedulerLock lock(system.Kernel());
    const std::vector<std::shared_ptr<Thread>> waiting_threads =
        GetThreadsWaitingOnAddress(address);
    WakeThreads(waiting_threads, num_to_wake);
@@ -64,6 +68,7 @@ ResultCode AddressArbiter::SignalToAddressOnly(VAddr address, s32 num_to_wake) {

 ResultCode AddressArbiter::IncrementAndSignalToAddressIfEqual(VAddr address, s32 value,
                                                              s32 num_to_wake) {
+    SchedulerLock lock(system.Kernel());
    auto& memory = system.Memory();

    // Ensure that we can write to the address.
@@ -71,16 +76,24 @@ ResultCode AddressArbiter::IncrementAndSignalToAddressIfEqual(VAddr address, s32
        return ERR_INVALID_ADDRESS_STATE;
    }

-    if (static_cast<s32>(memory.Read32(address)) != value) {
-        return ERR_INVALID_STATE;
-    }
+    const std::size_t current_core = system.CurrentCoreIndex();
+    auto& monitor = system.Monitor();
+    u32 current_value;
+    do { // Repeats for as long as ExclusiveWrite32 reports failure.
+        current_value = monitor.ExclusiveRead32(current_core, address);
+
+        if (current_value != static_cast<u32>(value)) { // explicit cast: compare as u32
+            return ERR_INVALID_STATE;
+        }
+        current_value++;
+    } while (!monitor.ExclusiveWrite32(current_core, address, current_value));

-    memory.Write32(address, static_cast<u32>(value + 1));
    return SignalToAddressOnly(address, num_to_wake);
 }

 ResultCode AddressArbiter::ModifyByWaitingCountAndSignalToAddressIfEqual(VAddr address, s32 value,
                                                                         s32 num_to_wake) {
+    SchedulerLock lock(system.Kernel());
    auto& memory = system.Memory();

    // Ensure that we can write to the address.
@@ -92,29 +105,33 @@ ResultCode AddressArbiter::ModifyByWaitingCountAndSignalToAddressIfEqual(VAddr a
    const std::vector<std::shared_ptr<Thread>> waiting_threads =
        GetThreadsWaitingOnAddress(address);

-    // Determine the modified value depending on the waiting count.
+    const std::size_t current_core = system.CurrentCoreIndex();
+    auto& monitor = system.Monitor();
    s32 updated_value;
-    if (num_to_wake <= 0) {
-        if (waiting_threads.empty()) {
-            updated_value = value + 1;
-        } else {
-            updated_value = value - 1;
-        }
-    } else {
-        if (waiting_threads.empty()) {
-            updated_value = value + 1;
-        } else if (waiting_threads.size() <= static_cast<u32>(num_to_wake)) {
-            updated_value = value - 1;
-        } else {
-            updated_value = value;
-        }
-    }
+    do {
+        updated_value = monitor.ExclusiveRead32(current_core, address);

-    if (static_cast<s32>(memory.Read32(address)) != value) {
-        return ERR_INVALID_STATE;
-    }
+        if (updated_value != value) {
+            return ERR_INVALID_STATE;
+        }
+        // Determine the modified value depending on the waiting count.
+        if (num_to_wake <= 0) {
+            if (waiting_threads.empty()) {
+                updated_value = value + 1;
+            } else {
+                updated_value = value - 1;
+            }
+        } else {
+            if (waiting_threads.empty()) {
+                updated_value = value + 1;
+            } else if (waiting_threads.size() <= static_cast<u32>(num_to_wake)) {
+                updated_value = value - 1;
+            } else {
+                updated_value = value;
+            }
+        }
+    } while (!monitor.ExclusiveWrite32(current_core, address, updated_value));

-    memory.Write32(address, static_cast<u32>(updated_value));
    WakeThreads(waiting_threads, num_to_wake);
    return RESULT_SUCCESS;
 }
@@ -136,60 +153,127 @@ ResultCode AddressArbiter::WaitForAddress(VAddr address, ArbitrationType type, s
 ResultCode AddressArbiter::WaitForAddressIfLessThan(VAddr address, s32 value, s64 timeout,
                                                    bool should_decrement) {
    auto& memory = system.Memory();
+    auto& kernel = system.Kernel();
+    Thread* current_thread = system.CurrentScheduler().GetCurrentThread();

-    // Ensure that we can read the address.
-    if (!memory.IsValidVirtualAddress(address)) {
-        return ERR_INVALID_ADDRESS_STATE;
+    Handle event_handle = InvalidHandle;
+    {
+        SchedulerLockAndSleep lock(kernel, event_handle, current_thread, timeout);
+
+        if (current_thread->IsPendingTermination()) {
+            lock.CancelSleep();
+            return ERR_THREAD_TERMINATING;
+        }
+
+        // Ensure that we can read the address.
+        if (!memory.IsValidVirtualAddress(address)) {
+            lock.CancelSleep();
+            return ERR_INVALID_ADDRESS_STATE;
+        }
+
+        s32 current_value = static_cast<s32>(memory.Read32(address));
+        if (current_value >= value) {
+            lock.CancelSleep();
+            return ERR_INVALID_STATE;
+        }
+
+        current_thread->SetSynchronizationResults(nullptr, RESULT_TIMEOUT);
+
+        s32 decrement_value;
+
+        const std::size_t current_core = system.CurrentCoreIndex();
+        auto& monitor = system.Monitor();
+        do {
+            current_value = static_cast<s32>(monitor.ExclusiveRead32(current_core, address));
+            if (should_decrement) {
+                decrement_value = current_value - 1;
+            } else {
+                decrement_value = current_value;
+            }
+        } while (
+            !monitor.ExclusiveWrite32(current_core, address, static_cast<u32>(decrement_value)));
+
+        // Short-circuit without rescheduling, if timeout is zero.
+        if (timeout == 0) {
+            lock.CancelSleep();
+            return RESULT_TIMEOUT;
+        }
+
+        current_thread->SetArbiterWaitAddress(address);
+        InsertThread(SharedFrom(current_thread));
+        current_thread->SetStatus(ThreadStatus::WaitArb);
+        current_thread->WaitForArbitration(true);
    }

-    const s32 cur_value = static_cast<s32>(memory.Read32(address));
-    if (cur_value >= value) {
-        return ERR_INVALID_STATE;
+    if (event_handle != InvalidHandle) {
+        auto& time_manager = kernel.TimeManager();
+        time_manager.UnscheduleTimeEvent(event_handle);
    }

-    if (should_decrement) {
-        memory.Write32(address, static_cast<u32>(cur_value - 1));
+    {
+        SchedulerLock lock(kernel);
+        if (current_thread->IsWaitingForArbitration()) {
+            RemoveThread(SharedFrom(current_thread));
+            current_thread->WaitForArbitration(false);
+        }
    }

-    // Short-circuit without rescheduling, if timeout is zero.
-    if (timeout == 0) {
-        return RESULT_TIMEOUT;
-    }
-
-    return WaitForAddressImpl(address, timeout);
+    return current_thread->GetSignalingResult();
 }

 ResultCode AddressArbiter::WaitForAddressIfEqual(VAddr address, s32 value, s64 timeout) {
    auto& memory = system.Memory();
-
-    // Ensure that we can read the address.
-    if (!memory.IsValidVirtualAddress(address)) {
-        return ERR_INVALID_ADDRESS_STATE;
-    }
-
-    // Only wait for the address if equal.
-    if (static_cast<s32>(memory.Read32(address)) != value) {
-        return ERR_INVALID_STATE;
-    }
-
-    // Short-circuit without rescheduling if timeout is zero.
-    if (timeout == 0) {
-        return RESULT_TIMEOUT;
-    }
-
-    return WaitForAddressImpl(address, timeout);
-}
-
-ResultCode AddressArbiter::WaitForAddressImpl(VAddr address, s64 timeout) {
+    auto& kernel = system.Kernel();
    Thread* current_thread = system.CurrentScheduler().GetCurrentThread();
-    current_thread->SetArbiterWaitAddress(address);
-    InsertThread(SharedFrom(current_thread));
-    current_thread->SetStatus(ThreadStatus::WaitArb);
-    current_thread->InvalidateWakeupCallback();
-    current_thread->WakeAfterDelay(timeout);

-    system.PrepareReschedule(current_thread->GetProcessorID());
-    return RESULT_TIMEOUT;
+    Handle event_handle = InvalidHandle;
+    {
+        SchedulerLockAndSleep lock(kernel, event_handle, current_thread, timeout);
+
+        if (current_thread->IsPendingTermination()) {
+            lock.CancelSleep();
+            return ERR_THREAD_TERMINATING;
+        }
+
+        // Ensure that we can read the address.
+        if (!memory.IsValidVirtualAddress(address)) {
+            lock.CancelSleep();
+            return ERR_INVALID_ADDRESS_STATE;
+        }
+
+        s32 current_value = static_cast<s32>(memory.Read32(address));
+        if (current_value != value) {
+            lock.CancelSleep();
+            return ERR_INVALID_STATE;
+        }
+
+        // Short-circuit without rescheduling, if timeout is zero.
+        if (timeout == 0) {
+            lock.CancelSleep();
+            return RESULT_TIMEOUT;
+        }
+
+        current_thread->SetSynchronizationResults(nullptr, RESULT_TIMEOUT);
+        current_thread->SetArbiterWaitAddress(address);
+        InsertThread(SharedFrom(current_thread));
+        current_thread->SetStatus(ThreadStatus::WaitArb);
+        current_thread->WaitForArbitration(true);
+    }
+
+    if (event_handle != InvalidHandle) {
+        auto& time_manager = kernel.TimeManager();
+        time_manager.UnscheduleTimeEvent(event_handle);
+    }
+
+    {
+        SchedulerLock lock(kernel);
+        if (current_thread->IsWaitingForArbitration()) {
+            RemoveThread(SharedFrom(current_thread));
+            current_thread->WaitForArbitration(false);
+        }
+    }
+
+    return current_thread->GetSignalingResult();
 }

 void AddressArbiter::HandleWakeupThread(std::shared_ptr<Thread> thread) {
@@ -221,9 +305,9 @@ void AddressArbiter::RemoveThread(std::shared_ptr<Thread> thread) {
    const auto iter = std::find_if(thread_list.cbegin(), thread_list.cend(),
                                   [&thread](const auto& entry) { return thread == entry; });

-    ASSERT(iter != thread_list.cend());
-
-    thread_list.erase(iter);
+    if (iter != thread_list.cend()) {
+        thread_list.erase(iter);
+    }
 }

 std::vector<std::shared_ptr<Thread>> AddressArbiter::GetThreadsWaitingOnAddress(
--- a/src/core/hle/kernel/address_arbiter.h
+++ b/src/core/hle/kernel/address_arbiter.h
@@ -73,9 +73,6 @@ private:
    /// Waits on an address if the value passed is equal to the argument value.
    ResultCode WaitForAddressIfEqual(VAddr address, s32 value, s64 timeout);

-    // Waits on the given address with a timeout in nanoseconds
-    ResultCode WaitForAddressImpl(VAddr address, s64 timeout);
-
    /// Wake up num_to_wake (or all) threads in a vector.
    void WakeThreads(const std::vector<std::shared_ptr<Thread>>& waiting_threads, s32 num_to_wake);

--- a/src/core/hle/kernel/client_port.cpp
+++ b/src/core/hle/kernel/client_port.cpp
@@ -34,7 +34,7 @@ ResultVal<std::shared_ptr<ClientSession>> ClientPort::Connect() {
    }

    // Wake the threads waiting on the ServerPort
-    server_port->WakeupAllWaitingThreads();
+    server_port->Signal();

    return MakeResult(std::move(client));
 }
--- a/src/core/hle/kernel/errors.h
+++ b/src/core/hle/kernel/errors.h
@@ -12,6 +12,7 @@ namespace Kernel {

 constexpr ResultCode ERR_MAX_CONNECTIONS_REACHED{ErrorModule::Kernel, 7};
 constexpr ResultCode ERR_INVALID_CAPABILITY_DESCRIPTOR{ErrorModule::Kernel, 14};
+constexpr ResultCode ERR_THREAD_TERMINATING{ErrorModule::Kernel, 59};
 constexpr ResultCode ERR_INVALID_SIZE{ErrorModule::Kernel, 101};
 constexpr ResultCode ERR_INVALID_ADDRESS{ErrorModule::Kernel, 102};
 constexpr ResultCode ERR_OUT_OF_RESOURCES{ErrorModule::Kernel, 103};
--- a/src/core/hle/kernel/hle_ipc.cpp
+++ b/src/core/hle/kernel/hle_ipc.cpp
@@ -14,14 +14,17 @@
 #include "common/common_types.h"
 #include "common/logging/log.h"
 #include "core/hle/ipc_helpers.h"
+#include "core/hle/kernel/errors.h"
 #include "core/hle/kernel/handle_table.h"
 #include "core/hle/kernel/hle_ipc.h"
 #include "core/hle/kernel/kernel.h"
 #include "core/hle/kernel/object.h"
 #include "core/hle/kernel/process.h"
 #include "core/hle/kernel/readable_event.h"
+#include "core/hle/kernel/scheduler.h"
 #include "core/hle/kernel/server_session.h"
 #include "core/hle/kernel/thread.h"
+#include "core/hle/kernel/time_manager.h"
 #include "core/hle/kernel/writable_event.h"
 #include "core/memory.h"

@@ -46,15 +49,6 @@ std::shared_ptr<WritableEvent> HLERequestContext::SleepClientThread(
    const std::string& reason, u64 timeout, WakeupCallback&& callback,
    std::shared_ptr<WritableEvent> writable_event) {
    // Put the client thread to sleep until the wait event is signaled or the timeout expires.
-    thread->SetWakeupCallback(
-        [context = *this, callback](ThreadWakeupReason reason, std::shared_ptr<Thread> thread,
-                                    std::shared_ptr<SynchronizationObject> object,
-                                    std::size_t index) mutable -> bool {
-            ASSERT(thread->GetStatus() == ThreadStatus::WaitHLEEvent);
-            callback(thread, context, reason);
-            context.WriteToOutgoingCommandBuffer(*thread);
-            return true;
-        });

    if (!writable_event) {
        // Create event if not provided
@@ -62,14 +56,26 @@ std::shared_ptr<WritableEvent> HLERequestContext::SleepClientThread(
        writable_event = pair.writable;
    }

-    const auto readable_event{writable_event->GetReadableEvent()};
-    writable_event->Clear();
-    thread->SetStatus(ThreadStatus::WaitHLEEvent);
-    thread->SetSynchronizationObjects({readable_event});
-    readable_event->AddWaitingThread(thread);
-
-    if (timeout > 0) {
-        thread->WakeAfterDelay(timeout);
+    {
+        Handle event_handle = InvalidHandle;
+        SchedulerLockAndSleep lock(kernel, event_handle, thread.get(), timeout);
+        thread->SetHLECallback(
+            [context = *this, callback](std::shared_ptr<Thread> thread) mutable -> bool {
+                ThreadWakeupReason reason = thread->GetSignalingResult() == RESULT_TIMEOUT
+                                                ? ThreadWakeupReason::Timeout
+                                                : ThreadWakeupReason::Signal;
+                callback(thread, context, reason);
+                context.WriteToOutgoingCommandBuffer(*thread);
+                return true;
+            });
+        const auto readable_event{writable_event->GetReadableEvent()};
+        writable_event->Clear();
+        thread->SetHLESyncObject(readable_event.get());
+        thread->SetStatus(ThreadStatus::WaitHLEEvent);
+        thread->SetSynchronizationResults(nullptr, RESULT_TIMEOUT);
+        readable_event->AddWaitingThread(thread);
+        lock.Release();
+        thread->SetHLETimeEvent(event_handle);
    }

    is_thread_waiting = true;
@@ -282,18 +288,18 @@ ResultCode HLERequestContext::WriteToOutgoingCommandBuffer(Thread& thread) {
 }

 std::vector<u8> HLERequestContext::ReadBuffer(std::size_t buffer_index) const {
-    std::vector<u8> buffer;
+    std::vector<u8> buffer{};
    const bool is_buffer_a{BufferDescriptorA().size() > buffer_index &&
                           BufferDescriptorA()[buffer_index].Size()};

    if (is_buffer_a) {
-        ASSERT_MSG(BufferDescriptorA().size() > buffer_index,
-                   "BufferDescriptorA invalid buffer_index {}", buffer_index);
+        ASSERT_OR_EXECUTE_MSG(BufferDescriptorA().size() > buffer_index, { return buffer; },
+                              "BufferDescriptorA invalid buffer_index {}", buffer_index);
        buffer.resize(BufferDescriptorA()[buffer_index].Size());
        memory.ReadBlock(BufferDescriptorA()[buffer_index].Address(), buffer.data(), buffer.size());
    } else {
-        ASSERT_MSG(BufferDescriptorX().size() > buffer_index,
-                   "BufferDescriptorX invalid buffer_index {}", buffer_index);
+        ASSERT_OR_EXECUTE_MSG(BufferDescriptorX().size() > buffer_index, { return buffer; },
+                              "BufferDescriptorX invalid buffer_index {}", buffer_index);
        buffer.resize(BufferDescriptorX()[buffer_index].Size());
        memory.ReadBlock(BufferDescriptorX()[buffer_index].Address(), buffer.data(), buffer.size());
    }
@@ -318,16 +324,16 @@ std::size_t HLERequestContext::WriteBuffer(const void* buffer, std::size_t size,
    }

    if (is_buffer_b) {
-        ASSERT_MSG(BufferDescriptorB().size() > buffer_index,
-                   "BufferDescriptorB invalid buffer_index {}", buffer_index);
-        ASSERT_MSG(BufferDescriptorB()[buffer_index].Size() >= size,
-                   "BufferDescriptorB buffer_index {} is not large enough", buffer_index);
+        ASSERT_OR_EXECUTE_MSG(BufferDescriptorB().size() > buffer_index &&
+                                  BufferDescriptorB()[buffer_index].Size() >= size,
+                              { return 0; }, "BufferDescriptorB is invalid, index={}, size={}",
+                              buffer_index, size);
        memory.WriteBlock(BufferDescriptorB()[buffer_index].Address(), buffer, size);
    } else {
-        ASSERT_MSG(BufferDescriptorC().size() > buffer_index,
-                   "BufferDescriptorC invalid buffer_index {}", buffer_index);
-        ASSERT_MSG(BufferDescriptorC()[buffer_index].Size() >= size,
-                   "BufferDescriptorC buffer_index {} is not large enough", buffer_index);
+        ASSERT_OR_EXECUTE_MSG(BufferDescriptorC().size() > buffer_index &&
+                                  BufferDescriptorC()[buffer_index].Size() >= size,
+                              { return 0; }, "BufferDescriptorC is invalid, index={}, size={}",
+                              buffer_index, size);
        memory.WriteBlock(BufferDescriptorC()[buffer_index].Address(), buffer, size);
    }

@@ -338,16 +344,12 @@ std::size_t HLERequestContext::GetReadBufferSize(std::size_t buffer_index) const
    const bool is_buffer_a{BufferDescriptorA().size() > buffer_index &&
                           BufferDescriptorA()[buffer_index].Size()};
    if (is_buffer_a) {
-        ASSERT_MSG(BufferDescriptorA().size() > buffer_index,
-                   "BufferDescriptorA invalid buffer_index {}", buffer_index);
-        ASSERT_MSG(BufferDescriptorA()[buffer_index].Size() > 0,
-                   "BufferDescriptorA buffer_index {} is empty", buffer_index);
+        ASSERT_OR_EXECUTE_MSG(BufferDescriptorA().size() > buffer_index, { return 0; },
+                              "BufferDescriptorA invalid buffer_index {}", buffer_index);
        return BufferDescriptorA()[buffer_index].Size();
    } else {
-        ASSERT_MSG(BufferDescriptorX().size() > buffer_index,
-                   "BufferDescriptorX invalid buffer_index {}", buffer_index);
-        ASSERT_MSG(BufferDescriptorX()[buffer_index].Size() > 0,
-                   "BufferDescriptorX buffer_index {} is empty", buffer_index);
+        ASSERT_OR_EXECUTE_MSG(BufferDescriptorX().size() > buffer_index, { return 0; },
+                              "BufferDescriptorX invalid buffer_index {}", buffer_index);
        return BufferDescriptorX()[buffer_index].Size();
    }
 }
@@ -356,14 +358,15 @@ std::size_t HLERequestContext::GetWriteBufferSize(std::size_t buffer_index) cons
    const bool is_buffer_b{BufferDescriptorB().size() > buffer_index &&
                           BufferDescriptorB()[buffer_index].Size()};
    if (is_buffer_b) {
-        ASSERT_MSG(BufferDescriptorB().size() > buffer_index,
-                   "BufferDescriptorB invalid buffer_index {}", buffer_index);
+        ASSERT_OR_EXECUTE_MSG(BufferDescriptorB().size() > buffer_index, { return 0; },
+                              "BufferDescriptorB invalid buffer_index {}", buffer_index);
        return BufferDescriptorB()[buffer_index].Size();
    } else {
-        ASSERT_MSG(BufferDescriptorC().size() > buffer_index,
-                   "BufferDescriptorC invalid buffer_index {}", buffer_index);
+        ASSERT_OR_EXECUTE_MSG(BufferDescriptorC().size() > buffer_index, { return 0; },
+                              "BufferDescriptorC invalid buffer_index {}", buffer_index);
        return BufferDescriptorC()[buffer_index].Size();
    }
+    return 0;
 }

 std::string HLERequestContext::Description() const {
--- a/src/core/hle/kernel/kernel.cpp
+++ b/src/core/hle/kernel/kernel.cpp
@@ -2,6 +2,7 @@
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.

+#include <array>
 #include <atomic>
 #include <bitset>
 #include <functional>
@@ -13,11 +14,15 @@

 #include "common/assert.h"
 #include "common/logging/log.h"
+#include "common/microprofile.h"
+#include "common/thread.h"
 #include "core/arm/arm_interface.h"
+#include "core/arm/cpu_interrupt_handler.h"
 #include "core/arm/exclusive_monitor.h"
 #include "core/core.h"
 #include "core/core_timing.h"
 #include "core/core_timing_util.h"
+#include "core/cpu_manager.h"
 #include "core/device_memory.h"
 #include "core/hardware_properties.h"
 #include "core/hle/kernel/client_port.h"
@@ -39,85 +44,28 @@
 #include "core/hle/result.h"
 #include "core/memory.h"

+MICROPROFILE_DEFINE(Kernel_SVC, "Kernel", "SVC", MP_RGB(70, 200, 70));
+
 namespace Kernel {

-/**
- * Callback that will wake up the thread it was scheduled for
- * @param thread_handle The handle of the thread that's been awoken
- * @param cycles_late The number of CPU cycles that have passed since the desired wakeup time
- */
-static void ThreadWakeupCallback(u64 thread_handle, [[maybe_unused]] s64 cycles_late) {
-    const auto proper_handle = static_cast<Handle>(thread_handle);
-    const auto& system = Core::System::GetInstance();
-
-    // Lock the global kernel mutex when we enter the kernel HLE.
-    std::lock_guard lock{HLE::g_hle_lock};
-
-    std::shared_ptr<Thread> thread =
-        system.Kernel().RetrieveThreadFromGlobalHandleTable(proper_handle);
-    if (thread == nullptr) {
-        LOG_CRITICAL(Kernel, "Callback fired for invalid thread {:08X}", proper_handle);
-        return;
-    }
-
-    bool resume = true;
-
-    if (thread->GetStatus() == ThreadStatus::WaitSynch ||
-        thread->GetStatus() == ThreadStatus::WaitHLEEvent) {
-        // Remove the thread from each of its waiting objects' waitlists
-        for (const auto& object : thread->GetSynchronizationObjects()) {
-            object->RemoveWaitingThread(thread);
-        }
-        thread->ClearSynchronizationObjects();
-
-        // Invoke the wakeup callback before clearing the wait objects
-        if (thread->HasWakeupCallback()) {
-            resume = thread->InvokeWakeupCallback(ThreadWakeupReason::Timeout, thread, nullptr, 0);
-        }
-    } else if (thread->GetStatus() == ThreadStatus::WaitMutex ||
-               thread->GetStatus() == ThreadStatus::WaitCondVar) {
-        thread->SetMutexWaitAddress(0);
-        thread->SetWaitHandle(0);
-        if (thread->GetStatus() == ThreadStatus::WaitCondVar) {
-            thread->GetOwnerProcess()->RemoveConditionVariableThread(thread);
-            thread->SetCondVarWaitAddress(0);
-        }
-
-        auto* const lock_owner = thread->GetLockOwner();
-        // Threads waking up by timeout from WaitProcessWideKey do not perform priority inheritance
-        // and don't have a lock owner unless SignalProcessWideKey was called first and the thread
-        // wasn't awakened due to the mutex already being acquired.
-        if (lock_owner != nullptr) {
-            lock_owner->RemoveMutexWaiter(thread);
-        }
-    }
-
-    if (thread->GetStatus() == ThreadStatus::WaitArb) {
-        auto& address_arbiter = thread->GetOwnerProcess()->GetAddressArbiter();
-        address_arbiter.HandleWakeupThread(thread);
-    }
-
-    if (resume) {
-        if (thread->GetStatus() == ThreadStatus::WaitCondVar ||
-            thread->GetStatus() == ThreadStatus::WaitArb) {
-            thread->SetWaitSynchronizationResult(RESULT_TIMEOUT);
-        }
-        thread->ResumeFromWait();
-    }
-}
-
 struct KernelCore::Impl {
    explicit Impl(Core::System& system, KernelCore& kernel)
        : global_scheduler{kernel}, synchronization{system}, time_manager{system}, system{system} {}

+    void SetMulticore(bool is_multicore) {
+        this->is_multicore = is_multicore;
+    }
+
    void Initialize(KernelCore& kernel) {
        Shutdown();
+        RegisterHostThread();

        InitializePhysicalCores();
        InitializeSystemResourceLimit(kernel);
        InitializeMemoryLayout();
-        InitializeThreads();
-        InitializePreemption();
+        InitializePreemption(kernel);
+        InitializeSchedulers();
+        InitializeSuspendThreads();
    }

    void Shutdown() {
@@ -126,13 +74,26 @@ struct KernelCore::Impl {
        next_user_process_id = Process::ProcessIDMin;
        next_thread_id = 1;

+        for (std::size_t i = 0; i < Core::Hardware::NUM_CPU_CORES; i++) {
+            if (suspend_threads[i]) {
+                suspend_threads[i].reset();
+            }
+        }
+
+        for (std::size_t i = 0; i < cores.size(); i++) {
+            cores[i].Shutdown();
+            schedulers[i].reset();
+        }
+        cores.clear();
+
+        registered_core_threads.reset();
+
        process_list.clear();
        current_process = nullptr;

        system_resource_limit = nullptr;

        global_handle_table.Clear();
-        thread_wakeup_event_type = nullptr;
        preemption_event = nullptr;

        global_scheduler.Shutdown();
@@ -145,13 +106,21 @@ struct KernelCore::Impl {
        cores.clear();

        exclusive_monitor.reset();
+        host_thread_ids.clear();
    }

    void InitializePhysicalCores() {
        exclusive_monitor =
            Core::MakeExclusiveMonitor(system.Memory(), Core::Hardware::NUM_CPU_CORES);
        for (std::size_t i = 0; i < Core::Hardware::NUM_CPU_CORES; i++) {
-            cores.emplace_back(system, i, *exclusive_monitor);
+            schedulers[i] = std::make_unique<Kernel::Scheduler>(system, i);
+            cores.emplace_back(system, i, *schedulers[i], interrupts[i]);
+        }
+    }
+
+    void InitializeSchedulers() {
+        for (std::size_t i = 0; i < Core::Hardware::NUM_CPU_CORES; i++) {
+            cores[i].Scheduler().Initialize();
        }
    }

@@ -173,15 +142,13 @@ struct KernelCore::Impl {
        }
    }

-    void InitializeThreads() {
-        thread_wakeup_event_type =
-            Core::Timing::CreateEvent("ThreadWakeupCallback", ThreadWakeupCallback);
-    }
-
-    void InitializePreemption() {
-        preemption_event =
-            Core::Timing::CreateEvent("PreemptionCallback", [this](u64 userdata, s64 cycles_late) {
-                global_scheduler.PreemptThreads();
+    void InitializePreemption(KernelCore& kernel) {
+        preemption_event = Core::Timing::CreateEvent(
+            "PreemptionCallback", [this, &kernel](u64 userdata, s64 cycles_late) {
+                {
+                    SchedulerLock lock(kernel);
+                    global_scheduler.PreemptThreads();
+                }
                s64 time_interval = Core::Timing::msToCycles(std::chrono::milliseconds(10));
                system.CoreTiming().ScheduleEvent(time_interval, preemption_event);
            });
@@ -190,6 +157,20 @@ struct KernelCore::Impl {
        system.CoreTiming().ScheduleEvent(time_interval, preemption_event);
    }

+    /// Creates one suspend thread per emulated CPU core and stores it in suspend_threads.
+    void InitializeSuspendThreads() {
+        // Each suspend thread is tagged with the KERNEL, HLE and SUSPEND thread-type flags.
+        const ThreadType suspend_type =
+            static_cast<ThreadType>(THREADTYPE_KERNEL | THREADTYPE_HLE | THREADTYPE_SUSPEND);
+
+        for (std::size_t core = 0; core < Core::Hardware::NUM_CPU_CORES; ++core) {
+            std::string thread_name = "Suspend Thread Id:" + std::to_string(core);
+            // The entry point and its argument are both supplied by the CPU manager.
+            std::function<void(void*)> start_func =
+                system.GetCpuManager().GetSuspendThreadStartFunc();
+            void* start_parameter = system.GetCpuManager().GetStartFuncParamater();
+            suspend_threads[core] =
+                Thread::Create(system, suspend_type, thread_name, 0, 0, 0, static_cast<u32>(core),
+                               0, nullptr, std::move(start_func), start_parameter)
+                    .Unwrap();
+        }
+    }
+
    void MakeCurrentProcess(Process* process) {
        current_process = process;

@@ -197,15 +178,17 @@ struct KernelCore::Impl {
            return;
        }

-        for (auto& core : cores) {
-            core.SetIs64Bit(process->Is64BitProcess());
+        u32 core_id = GetCurrentHostThreadID();
+        if (core_id < Core::Hardware::NUM_CPU_CORES) {
+            system.Memory().SetCurrentPageTable(*process, core_id);
        }
-
-        system.Memory().SetCurrentPageTable(*process);
    }

    void RegisterCoreThread(std::size_t core_id) {
        std::unique_lock lock{register_thread_mutex};
+        if (!is_multicore) {
+            single_core_thread_id = std::this_thread::get_id();
+        }
        const std::thread::id this_id = std::this_thread::get_id();
        const auto it = host_thread_ids.find(this_id);
        ASSERT(core_id < Core::Hardware::NUM_CPU_CORES);
@@ -219,12 +202,19 @@ struct KernelCore::Impl {
        std::unique_lock lock{register_thread_mutex};
        const std::thread::id this_id = std::this_thread::get_id();
        const auto it = host_thread_ids.find(this_id);
-        ASSERT(it == host_thread_ids.end());
+        if (it != host_thread_ids.end()) {
+            return;
+        }
        host_thread_ids[this_id] = registered_thread_ids++;
    }

    u32 GetCurrentHostThreadID() const {
        const std::thread::id this_id = std::this_thread::get_id();
+        if (!is_multicore) {
+            if (single_core_thread_id == this_id) {
+                return static_cast<u32>(system.GetCpuManager().CurrentCore());
+            }
+        }
        const auto it = host_thread_ids.find(this_id);
        if (it == host_thread_ids.end()) {
            return Core::INVALID_HOST_THREAD_ID;
@@ -240,7 +230,7 @@ struct KernelCore::Impl {
        }
        const Kernel::Scheduler& sched = cores[result.host_handle].Scheduler();
        const Kernel::Thread* current = sched.GetCurrentThread();
-        if (current != nullptr) {
+        if (current != nullptr && !current->IsPhantomMode()) {
            result.guest_handle = current->GetGlobalHandle();
        } else {
            result.guest_handle = InvalidHandle;
@@ -313,7 +303,6 @@ struct KernelCore::Impl {

    std::shared_ptr<ResourceLimit> system_resource_limit;

-    std::shared_ptr<Core::Timing::EventType> thread_wakeup_event_type;
    std::shared_ptr<Core::Timing::EventType> preemption_event;

    // This is the kernel's handle table or supervisor handle table which
@@ -343,6 +332,15 @@ struct KernelCore::Impl {
    std::shared_ptr<Kernel::SharedMemory> irs_shared_mem;
    std::shared_ptr<Kernel::SharedMemory> time_shared_mem;

+    std::array<std::shared_ptr<Thread>, Core::Hardware::NUM_CPU_CORES> suspend_threads{};
+    std::array<Core::CPUInterruptHandler, Core::Hardware::NUM_CPU_CORES> interrupts{};
+    std::array<std::unique_ptr<Kernel::Scheduler>, Core::Hardware::NUM_CPU_CORES> schedulers{};
+
+    bool is_multicore{};
+    std::thread::id single_core_thread_id{};
+
+    std::array<u64, Core::Hardware::NUM_CPU_CORES> svc_ticks{};
+
    // System context
    Core::System& system;
 };
@@ -352,6 +350,10 @@ KernelCore::~KernelCore() {
    Shutdown();
 }

+// Forwards the multicore/single-core selection to the kernel implementation.
+void KernelCore::SetMulticore(bool is_multicore) {
+    impl->SetMulticore(is_multicore);
+}
+
 void KernelCore::Initialize() {
    impl->Initialize(*this);
 }
@@ -397,11 +399,11 @@ const Kernel::GlobalScheduler& KernelCore::GlobalScheduler() const {
 }

 Kernel::Scheduler& KernelCore::Scheduler(std::size_t id) {
-    return impl->cores[id].Scheduler();
+    return *impl->schedulers[id];
 }

 const Kernel::Scheduler& KernelCore::Scheduler(std::size_t id) const {
-    return impl->cores[id].Scheduler();
+    return *impl->schedulers[id];
 }

 Kernel::PhysicalCore& KernelCore::PhysicalCore(std::size_t id) {
@@ -412,6 +414,39 @@ const Kernel::PhysicalCore& KernelCore::PhysicalCore(std::size_t id) const {
    return impl->cores[id];
 }

+Kernel::PhysicalCore& KernelCore::CurrentPhysicalCore() {
+    // The calling host thread's id is used directly as the index into the core table,
+    // so it has to name one of the emulated cores.
+    const u32 host_thread_id = impl->GetCurrentHostThreadID();
+    ASSERT(host_thread_id < Core::Hardware::NUM_CPU_CORES);
+    return impl->cores[host_thread_id];
+}
+
+const Kernel::PhysicalCore& KernelCore::CurrentPhysicalCore() const {
+    // Const overload: the calling host thread's id indexes the core table and must
+    // name one of the emulated cores.
+    const u32 host_thread_id = impl->GetCurrentHostThreadID();
+    ASSERT(host_thread_id < Core::Hardware::NUM_CPU_CORES);
+    return impl->cores[host_thread_id];
+}
+
+Kernel::Scheduler& KernelCore::CurrentScheduler() {
+    // The calling host thread's id selects the per-core scheduler owned by the
+    // implementation; it must name one of the emulated cores.
+    const u32 host_thread_id = impl->GetCurrentHostThreadID();
+    ASSERT(host_thread_id < Core::Hardware::NUM_CPU_CORES);
+    return *impl->schedulers[host_thread_id];
+}
+
+const Kernel::Scheduler& KernelCore::CurrentScheduler() const {
+    // Const overload: the calling host thread's id selects the per-core scheduler and
+    // must name one of the emulated cores.
+    const u32 host_thread_id = impl->GetCurrentHostThreadID();
+    ASSERT(host_thread_id < Core::Hardware::NUM_CPU_CORES);
+    return *impl->schedulers[host_thread_id];
+}
+
+// Returns the implementation's array of interrupt handlers (one entry per emulated core).
+std::array<Core::CPUInterruptHandler, Core::Hardware::NUM_CPU_CORES>& KernelCore::Interrupts() {
+    return impl->interrupts;
+}
+
+// Const overload: returns the implementation's array of interrupt handlers
+// (one entry per emulated core).
+const std::array<Core::CPUInterruptHandler, Core::Hardware::NUM_CPU_CORES>& KernelCore::Interrupts()
+    const {
+    return impl->interrupts;
+}
+
 Kernel::Synchronization& KernelCore::Synchronization() {
    return impl->synchronization;
 }
@@ -437,15 +472,17 @@ const Core::ExclusiveMonitor& KernelCore::GetExclusiveMonitor() const {
 }

 void KernelCore::InvalidateAllInstructionCaches() {
-    for (std::size_t i = 0; i < impl->global_scheduler.CpuCoresCount(); i++) {
-        PhysicalCore(i).ArmInterface().ClearInstructionCache();
+    auto& threads = GlobalScheduler().GetThreadList();
+    for (auto& thread : threads) {
+        if (!thread->IsHLEThread()) {
+            auto& arm_interface = thread->ArmInterface();
+            arm_interface.ClearInstructionCache();
+        }
    }
 }

 void KernelCore::PrepareReschedule(std::size_t id) {
-    if (id < impl->global_scheduler.CpuCoresCount()) {
-        impl->cores[id].Stop();
-    }
+    // TODO: Reimplement this.
 }

 void KernelCore::AddNamedPort(std::string name, std::shared_ptr<ClientPort> port) {
@@ -481,10 +518,6 @@ u64 KernelCore::CreateNewUserProcessID() {
    return impl->next_user_process_id++;
 }

-const std::shared_ptr<Core::Timing::EventType>& KernelCore::ThreadWakeupCallbackEventType() const {
-    return impl->thread_wakeup_event_type;
-}
-
 Kernel::HandleTable& KernelCore::GlobalHandleTable() {
    return impl->global_handle_table;
 }
@@ -557,4 +590,34 @@ const Kernel::SharedMemory& KernelCore::GetTimeSharedMem() const {
    return *impl->time_shared_mem;
 }

+void KernelCore::Suspend(bool in_suspention) {
+    // The suspend threads are set to Ready when suspension is requested or once
+    // exception_exited has been set, and to WaitSleep otherwise.
+    const ThreadStatus target_status =
+        (exception_exited || in_suspention) ? ThreadStatus::Ready : ThreadStatus::WaitSleep;
+
+    // Apply the status to every suspend thread while holding the scheduler lock.
+    SchedulerLock lock(*this);
+    for (auto& suspend_thread : impl->suspend_threads) {
+        suspend_thread->SetStatus(target_status);
+    }
+}
+
+// Returns the multicore flag stored in the kernel implementation.
+bool KernelCore::IsMulticore() const {
+    return impl->is_multicore;
+}
+
+// Sets the exceptional-exit flag and then suspends. Because Suspend() ORs in
+// exception_exited, a later Suspend(false) still sets the suspend threads to Ready.
+void KernelCore::ExceptionalExit() {
+    exception_exited = true;
+    Suspend(true);
+}
+
+// Starts the Kernel_SVC profiling scope for the calling core and stores the tick value
+// returned by MicroProfileEnter so that ExitSVCProfile() can close the scope.
+void KernelCore::EnterSVCProfile() {
+    // The host-thread id is used as the index into svc_ticks. It can be
+    // Core::INVALID_HOST_THREAD_ID when the calling thread is not registered, so guard the
+    // access instead of writing outside the per-core array.
+    const std::size_t core = impl->GetCurrentHostThreadID();
+    if (core >= impl->svc_ticks.size()) {
+        return;
+    }
+    impl->svc_ticks[core] = MicroProfileEnter(MICROPROFILE_TOKEN(Kernel_SVC));
+}
+
+// Closes the Kernel_SVC profiling scope for the calling core using the tick value stored
+// in svc_ticks for that core.
+void KernelCore::ExitSVCProfile() {
+    // The host-thread id is used as the index into svc_ticks. It can be
+    // Core::INVALID_HOST_THREAD_ID when the calling thread is not registered, so guard the
+    // access instead of reading outside the per-core array.
+    const std::size_t core = impl->GetCurrentHostThreadID();
+    if (core >= impl->svc_ticks.size()) {
+        return;
+    }
+    MicroProfileLeave(MICROPROFILE_TOKEN(Kernel_SVC), impl->svc_ticks[core]);
+}
+
 } // namespace Kernel
--- a/src/core/hle/kernel/kernel.h
+++ b/src/core/hle/kernel/kernel.h
@@ -4,15 +4,17 @@

 #pragma once

+#include <array>
 #include <memory>
 #include <string>
 #include <unordered_map>
 #include <vector>
+#include "core/hardware_properties.h"
 #include "core/hle/kernel/memory/memory_types.h"
 #include "core/hle/kernel/object.h"

 namespace Core {
-struct EmuThreadHandle;
+class CPUInterruptHandler;
 class ExclusiveMonitor;
 class System;
 } // namespace Core
@@ -65,6 +67,9 @@ public:
    KernelCore(KernelCore&&) = delete;
    KernelCore& operator=(KernelCore&&) = delete;

+    /// Sets whether emulation is multicore or single core; must be called before Initialize.
+    void SetMulticore(bool is_multicore);
+
    /// Resets the kernel to a clean slate for use.
    void Initialize();

@@ -110,6 +115,18 @@ public:
    /// Gets the an instance of the respective physical CPU core.
    const Kernel::PhysicalCore& PhysicalCore(std::size_t id) const;

+    /// Gets the sole instance of the Scheduler at the current running core.
+    Kernel::Scheduler& CurrentScheduler();
+
+    /// Gets the sole instance of the Scheduler at the current running core.
+    const Kernel::Scheduler& CurrentScheduler() const;
+
+    /// Gets an instance of the current physical CPU core.
+    Kernel::PhysicalCore& CurrentPhysicalCore();
+
+    /// Gets an instance of the current physical CPU core.
+    const Kernel::PhysicalCore& CurrentPhysicalCore() const;
+
    /// Gets the an instance of the Synchronization Interface.
    Kernel::Synchronization& Synchronization();

@@ -129,6 +146,10 @@ public:

    const Core::ExclusiveMonitor& GetExclusiveMonitor() const;

+    std::array<Core::CPUInterruptHandler, Core::Hardware::NUM_CPU_CORES>& Interrupts();
+
+    const std::array<Core::CPUInterruptHandler, Core::Hardware::NUM_CPU_CORES>& Interrupts() const;
+
    void InvalidateAllInstructionCaches();

    /// Adds a port to the named port table
@@ -191,6 +212,18 @@ public:
    /// Gets the shared memory object for Time services.
    const Kernel::SharedMemory& GetTimeSharedMem() const;

+    /// Suspend/unsuspend the OS.
+    void Suspend(bool in_suspention);
+
+    /// Exceptional exit the OS.
+    void ExceptionalExit();
+
+    bool IsMulticore() const;
+
+    void EnterSVCProfile();
+
+    void ExitSVCProfile();
+
 private:
    friend class Object;
    friend class Process;
@@ -208,9 +241,6 @@ private:
    /// Creates a new thread ID, incrementing the internal thread ID counter.
    u64 CreateNewThreadID();

-    /// Retrieves the event type used for thread wakeup callbacks.
-    const std::shared_ptr<Core::Timing::EventType>& ThreadWakeupCallbackEventType() const;
-
    /// Provides a reference to the global handle table.
    Kernel::HandleTable& GlobalHandleTable();

@@ -219,6 +249,7 @@ private:

    struct Impl;
    std::unique_ptr<Impl> impl;
+    bool exception_exited{};
 };

 } // namespace Kernel
--- a/src/core/hle/kernel/memory/memory_manager.cpp
+++ b/src/core/hle/kernel/memory/memory_manager.cpp
@@ -104,7 +104,7 @@ ResultCode MemoryManager::Allocate(PageLinkedList& page_list, std::size_t num_pa
    // Ensure that we don't leave anything un-freed
    auto group_guard = detail::ScopeExit([&] {
        for (const auto& it : page_list.Nodes()) {
-            const auto min_num_pages{std::min(
+            const auto min_num_pages{std::min<size_t>(
                it.GetNumPages(), (chosen_manager.GetEndAddress() - it.GetAddress()) / PageSize)};
            chosen_manager.Free(it.GetAddress(), min_num_pages);
        }
@@ -139,7 +139,6 @@ ResultCode MemoryManager::Allocate(PageLinkedList& page_list, std::size_t num_pa
    }

    // Only succeed if we allocated as many pages as we wanted
-    ASSERT(num_pages >= 0);
    if (num_pages) {
        return ERR_OUT_OF_MEMORY;
    }
@@ -165,7 +164,7 @@ ResultCode MemoryManager::Free(PageLinkedList& page_list, std::size_t num_pages,

    // Free all of the pages
    for (const auto& it : page_list.Nodes()) {
-        const auto min_num_pages{std::min(
+        const auto min_num_pages{std::min<size_t>(
            it.GetNumPages(), (chosen_manager.GetEndAddress() - it.GetAddress()) / PageSize)};
        chosen_manager.Free(it.GetAddress(), min_num_pages);
    }
--- a/src/core/hle/kernel/mutex.cpp
+++ b/src/core/hle/kernel/mutex.cpp
@@ -34,8 +34,6 @@ static std::pair<std::shared_ptr<Thread>, u32> GetHighestPriorityMutexWaitingThr
        if (thread->GetMutexWaitAddress() != mutex_addr)
            continue;

-        ASSERT(thread->GetStatus() == ThreadStatus::WaitMutex);
-
        ++num_waiters;
        if (highest_priority_thread == nullptr ||
            thread->GetPriority() < highest_priority_thread->GetPriority()) {
@@ -49,6 +47,7 @@ static std::pair<std::shared_ptr<Thread>, u32> GetHighestPriorityMutexWaitingThr
 /// Update the mutex owner field of all threads waiting on the mutex to point to the new owner.
 static void TransferMutexOwnership(VAddr mutex_addr, std::shared_ptr<Thread> current_thread,
                                   std::shared_ptr<Thread> new_owner) {
+    current_thread->RemoveMutexWaiter(new_owner);
    const auto threads = current_thread->GetMutexWaitingThreads();
    for (const auto& thread : threads) {
        if (thread->GetMutexWaitAddress() != mutex_addr)
@@ -72,85 +71,100 @@ ResultCode Mutex::TryAcquire(VAddr address, Handle holding_thread_handle,
        return ERR_INVALID_ADDRESS;
    }

-    const auto& handle_table = system.Kernel().CurrentProcess()->GetHandleTable();
+    auto& kernel = system.Kernel();
    std::shared_ptr<Thread> current_thread =
-        SharedFrom(system.CurrentScheduler().GetCurrentThread());
-    std::shared_ptr<Thread> holding_thread = handle_table.Get<Thread>(holding_thread_handle);
-    std::shared_ptr<Thread> requesting_thread = handle_table.Get<Thread>(requesting_thread_handle);
+        SharedFrom(kernel.CurrentScheduler().GetCurrentThread());
+    {
+        SchedulerLock lock(kernel);
+        // The mutex address must be 4-byte aligned
+        if ((address % sizeof(u32)) != 0) {
+            return ERR_INVALID_ADDRESS;
+        }

-    // TODO(Subv): It is currently unknown if it is possible to lock a mutex in behalf of another
-    // thread.
-    ASSERT(requesting_thread == current_thread);
+        const auto& handle_table = kernel.CurrentProcess()->GetHandleTable();
+        std::shared_ptr<Thread> holding_thread = handle_table.Get<Thread>(holding_thread_handle);
+        std::shared_ptr<Thread> requesting_thread =
+            handle_table.Get<Thread>(requesting_thread_handle);

-    const u32 addr_value = system.Memory().Read32(address);
+        // TODO(Subv): It is currently unknown if it is possible to lock a mutex in behalf of
+        // another thread.
+        ASSERT(requesting_thread == current_thread);

-    // If the mutex isn't being held, just return success.
-    if (addr_value != (holding_thread_handle | Mutex::MutexHasWaitersFlag)) {
-        return RESULT_SUCCESS;
+        current_thread->SetSynchronizationResults(nullptr, RESULT_SUCCESS);
+
+        const u32 addr_value = system.Memory().Read32(address);
+
+        // If the mutex isn't being held, just return success.
+        if (addr_value != (holding_thread_handle | Mutex::MutexHasWaitersFlag)) {
+            return RESULT_SUCCESS;
+        }
+
+        if (holding_thread == nullptr) {
+            return ERR_INVALID_HANDLE;
+        }
+
+        // Wait until the mutex is released
+        current_thread->SetMutexWaitAddress(address);
+        current_thread->SetWaitHandle(requesting_thread_handle);
+
+        current_thread->SetStatus(ThreadStatus::WaitMutex);
+
+        // Update the lock holder thread's priority to prevent priority inversion.
+        holding_thread->AddMutexWaiter(current_thread);
    }

-    if (holding_thread == nullptr) {
-        LOG_ERROR(Kernel, "Holding thread does not exist! thread_handle={:08X}",
-                  holding_thread_handle);
-        return ERR_INVALID_HANDLE;
+    {
+        SchedulerLock lock(kernel);
+        auto* owner = current_thread->GetLockOwner();
+        if (owner != nullptr) {
+            owner->RemoveMutexWaiter(current_thread);
+        }
    }
-
-    // Wait until the mutex is released
-    current_thread->SetMutexWaitAddress(address);
-    current_thread->SetWaitHandle(requesting_thread_handle);
-
-    current_thread->SetStatus(ThreadStatus::WaitMutex);
-    current_thread->InvalidateWakeupCallback();
-
-    // Update the lock holder thread's priority to prevent priority inversion.
-    holding_thread->AddMutexWaiter(current_thread);
-
-    system.PrepareReschedule();
-
-    return RESULT_SUCCESS;
+    return current_thread->GetSignalingResult();
 }

-ResultCode Mutex::Release(VAddr address) {
+std::pair<ResultCode, std::shared_ptr<Thread>> Mutex::Unlock(std::shared_ptr<Thread> owner,
+                                                             VAddr address) {
    // The mutex address must be 4-byte aligned
    if ((address % sizeof(u32)) != 0) {
        LOG_ERROR(Kernel, "Address is not 4-byte aligned! address={:016X}", address);
-        return ERR_INVALID_ADDRESS;
+        return {ERR_INVALID_ADDRESS, nullptr};
    }

-    std::shared_ptr<Thread> current_thread =
-        SharedFrom(system.CurrentScheduler().GetCurrentThread());
-    auto [thread, num_waiters] = GetHighestPriorityMutexWaitingThread(current_thread, address);
-
-    // There are no more threads waiting for the mutex, release it completely.
-    if (thread == nullptr) {
+    auto [new_owner, num_waiters] = GetHighestPriorityMutexWaitingThread(owner, address);
+    if (new_owner == nullptr) {
        system.Memory().Write32(address, 0);
-        return RESULT_SUCCESS;
+        return {RESULT_SUCCESS, nullptr};
    }
-
    // Transfer the ownership of the mutex from the previous owner to the new one.
-    TransferMutexOwnership(address, current_thread, thread);
-
-    u32 mutex_value = thread->GetWaitHandle();
-
+    TransferMutexOwnership(address, owner, new_owner);
+    u32 mutex_value = new_owner->GetWaitHandle();
    if (num_waiters >= 2) {
        // Notify the guest that there are still some threads waiting for the mutex
        mutex_value |= Mutex::MutexHasWaitersFlag;
    }
+    new_owner->SetSynchronizationResults(nullptr, RESULT_SUCCESS);
+    new_owner->SetLockOwner(nullptr);
+    new_owner->ResumeFromWait();

-    // Grant the mutex to the next waiting thread and resume it.
    system.Memory().Write32(address, mutex_value);
-
-    ASSERT(thread->GetStatus() == ThreadStatus::WaitMutex);
-    thread->ResumeFromWait();
-
-    thread->SetLockOwner(nullptr);
-    thread->SetCondVarWaitAddress(0);
-    thread->SetMutexWaitAddress(0);
-    thread->SetWaitHandle(0);
-    thread->SetWaitSynchronizationResult(RESULT_SUCCESS);
-
-    system.PrepareReschedule();
-
-    return RESULT_SUCCESS;
+    return {RESULT_SUCCESS, new_owner};
 }
+
+/// Releases the mutex at `address` on behalf of the current thread of the current core's
+/// scheduler and returns the result code produced by Unlock().
+ResultCode Mutex::Release(VAddr address) {
+    auto& kernel = system.Kernel();
+    // The scheduler lock is held for the whole ownership hand-over.
+    SchedulerLock lock(kernel);
+
+    std::shared_ptr<Thread> current_thread =
+        SharedFrom(kernel.CurrentScheduler().GetCurrentThread());
+
+    // Unlock() returns a non-null new owner only together with RESULT_SUCCESS (and has
+    // already stored that result in the new owner), so there is never a failure code to
+    // forward to another thread here.
+    return Unlock(current_thread, address).first;
+}
+
+
 } // namespace Kernel
--- a/Show More
+++ b/Show More