vk_rasterizer: fix typo in SetupGraphicsImages

This should use the Maxwell3D engine. Fixes some GPU errors in Kirby and possibly other games.
Merge pull request #3481 from ReinUsesLisp/abgr5-storage
2020-03-08 10:06:59 +07:00 · 2020-03-07 19:51:33 -05:00 · 2020-03-06 18:47:27 -03:00 · 2020-03-05 19:38:42 -05:00 · 2020-03-05 15:09:48 -05:00 · 2020-03-05 11:42:46 -05:00
134 changed files with 5926 additions and 1893 deletions
@@ -157,8 +157,14 @@ if (ENABLE_SDL2)
        target_include_directories(SDL2 INTERFACE "${SDL2_INCLUDE_DIR}")
    else()
        find_package(SDL2 REQUIRED)
-        include_directories(${SDL2_INCLUDE_DIRS})

+        # Some installations don't set SDL2_LIBRARIES
+        if("${SDL2_LIBRARIES}" STREQUAL "")
+            message(WARNING "SDL2_LIBRARIES wasn't set, manually setting to SDL2::SDL2")
+            set(SDL2_LIBRARIES "SDL2::SDL2")
+        endif()
+
+        include_directories(${SDL2_INCLUDE_DIRS})
        add_library(SDL2 INTERFACE)
        target_link_libraries(SDL2 INTERFACE "${SDL2_LIBRARIES}")
    endif()
@@ -1,6 +1,6 @@
 yuzu emulator
 =============
-[![Travis CI Build Status](https://travis-ci.org/yuzu-emu/yuzu.svg?branch=master)](https://travis-ci.org/yuzu-emu/yuzu)
+[![Travis CI Build Status](https://travis-ci.com/yuzu-emu/yuzu.svg?branch=master)](https://travis-ci.com/yuzu-emu/yuzu)
 [![Azure Mainline CI Build Status](https://dev.azure.com/yuzu-emu/yuzu/_apis/build/status/yuzu%20mainline?branchName=master)](https://dev.azure.com/yuzu-emu/yuzu/)

 yuzu is an experimental open-source emulator for the Nintendo Switch from the creators of [Citra](https://citra-emu.org/).
@@ -21,7 +21,7 @@ For development discussion, please join us on [Discord](https://discord.gg/XQV6d

 Most of the development happens on GitHub. It's also where [our central repository](https://github.com/yuzu-emu/yuzu) is hosted.

-If you want to contribute please take a look at the [Contributor's Guide](CONTRIBUTING.md) and [Developer Information](https://github.com/yuzu-emu/yuzu/wiki/Developer-Information). You should as well contact any of the developers on Discord in order to know about the current state of the emulator.
+If you want to contribute please take a look at the [Contributor's Guide](CONTRIBUTING.md) and [Developer Information](https://github.com/yuzu-emu/yuzu/wiki/Developer-Information). You should also contact any of the developers on Discord in order to know about the current state of the emulator.

 ### Building

@@ -1,4 +1,4 @@
-From https://github.com/yhirose/cpp-httplib/commit/d9479bc0b12e8a1e8bce2d34da4feeef488581f3
+From https://github.com/yhirose/cpp-httplib/tree/fce8e6fefdab4ad48bc5b25c98e5ebfda4f3cf53

 MIT License

@@ -5,21 +5,141 @@
 #define _USE_MATH_DEFINES

 #include <algorithm>
+#include <climits>
 #include <cmath>
 #include <vector>
+
 #include "audio_core/algorithm/interpolate.h"
 #include "common/common_types.h"
 #include "common/logging/log.h"

 namespace AudioCore {

-/// The Lanczos kernel
-static double Lanczos(std::size_t a, double x) {
-    if (x == 0.0)
-        return 1.0;
-    const double px = M_PI * x;
-    return a * std::sin(px) * std::sin(px / a) / (px * px);
-}
+// Resampling coefficient table used by Interpolate() when step > 0xaaaa, where
+// step = ratio * 0x8000. The 512 entries form 128 consecutive groups of
+// InterpolationState::taps (4) coefficients; Interpolate() selects a group with
+// (fraction >> 8) and shifts each weighted sum right by 15 bits.
+constexpr std::array<s16, 512> curve_lut0{
+    6600,  19426, 6722,  3,     6479,  19424, 6845,  9,     6359,  19419, 6968,  15,    6239,
+    19412, 7093,  22,    6121,  19403, 7219,  28,    6004,  19391, 7345,  34,    5888,  19377,
+    7472,  41,    5773,  19361, 7600,  48,    5659,  19342, 7728,  55,    5546,  19321, 7857,
+    62,    5434,  19298, 7987,  69,    5323,  19273, 8118,  77,    5213,  19245, 8249,  84,
+    5104,  19215, 8381,  92,    4997,  19183, 8513,  101,   4890,  19148, 8646,  109,   4785,
+    19112, 8780,  118,   4681,  19073, 8914,  127,   4579,  19031, 9048,  137,   4477,  18988,
+    9183,  147,   4377,  18942, 9318,  157,   4277,  18895, 9454,  168,   4179,  18845, 9590,
+    179,   4083,  18793, 9726,  190,   3987,  18738, 9863,  202,   3893,  18682, 10000, 215,
+    3800,  18624, 10137, 228,   3709,  18563, 10274, 241,   3618,  18500, 10411, 255,   3529,
+    18436, 10549, 270,   3441,  18369, 10687, 285,   3355,  18300, 10824, 300,   3269,  18230,
+    10962, 317,   3186,  18157, 11100, 334,   3103,  18082, 11238, 351,   3022,  18006, 11375,
+    369,   2942,  17927, 11513, 388,   2863,  17847, 11650, 408,   2785,  17765, 11788, 428,
+    2709,  17681, 11925, 449,   2635,  17595, 12062, 471,   2561,  17507, 12198, 494,   2489,
+    17418, 12334, 517,   2418,  17327, 12470, 541,   2348,  17234, 12606, 566,   2280,  17140,
+    12741, 592,   2213,  17044, 12876, 619,   2147,  16946, 13010, 647,   2083,  16846, 13144,
+    675,   2020,  16745, 13277, 704,   1958,  16643, 13409, 735,   1897,  16539, 13541, 766,
+    1838,  16434, 13673, 798,   1780,  16327, 13803, 832,   1723,  16218, 13933, 866,   1667,
+    16109, 14062, 901,   1613,  15998, 14191, 937,   1560,  15885, 14318, 975,   1508,  15772,
+    14445, 1013,  1457,  15657, 14571, 1052,  1407,  15540, 14695, 1093,  1359,  15423, 14819,
+    1134,  1312,  15304, 14942, 1177,  1266,  15185, 15064, 1221,  1221,  15064, 15185, 1266,
+    1177,  14942, 15304, 1312,  1134,  14819, 15423, 1359,  1093,  14695, 15540, 1407,  1052,
+    14571, 15657, 1457,  1013,  14445, 15772, 1508,  975,   14318, 15885, 1560,  937,   14191,
+    15998, 1613,  901,   14062, 16109, 1667,  866,   13933, 16218, 1723,  832,   13803, 16327,
+    1780,  798,   13673, 16434, 1838,  766,   13541, 16539, 1897,  735,   13409, 16643, 1958,
+    704,   13277, 16745, 2020,  675,   13144, 16846, 2083,  647,   13010, 16946, 2147,  619,
+    12876, 17044, 2213,  592,   12741, 17140, 2280,  566,   12606, 17234, 2348,  541,   12470,
+    17327, 2418,  517,   12334, 17418, 2489,  494,   12198, 17507, 2561,  471,   12062, 17595,
+    2635,  449,   11925, 17681, 2709,  428,   11788, 17765, 2785,  408,   11650, 17847, 2863,
+    388,   11513, 17927, 2942,  369,   11375, 18006, 3022,  351,   11238, 18082, 3103,  334,
+    11100, 18157, 3186,  317,   10962, 18230, 3269,  300,   10824, 18300, 3355,  285,   10687,
+    18369, 3441,  270,   10549, 18436, 3529,  255,   10411, 18500, 3618,  241,   10274, 18563,
+    3709,  228,   10137, 18624, 3800,  215,   10000, 18682, 3893,  202,   9863,  18738, 3987,
+    190,   9726,  18793, 4083,  179,   9590,  18845, 4179,  168,   9454,  18895, 4277,  157,
+    9318,  18942, 4377,  147,   9183,  18988, 4477,  137,   9048,  19031, 4579,  127,   8914,
+    19073, 4681,  118,   8780,  19112, 4785,  109,   8646,  19148, 4890,  101,   8513,  19183,
+    4997,  92,    8381,  19215, 5104,  84,    8249,  19245, 5213,  77,    8118,  19273, 5323,
+    69,    7987,  19298, 5434,  62,    7857,  19321, 5546,  55,    7728,  19342, 5659,  48,
+    7600,  19361, 5773,  41,    7472,  19377, 5888,  34,    7345,  19391, 6004,  28,    7219,
+    19403, 6121,  22,    7093,  19412, 6239,  15,    6968,  19419, 6359,  9,     6845,  19424,
+    6479,  3,     6722,  19426, 6600};
+
+// Resampling coefficient table used by Interpolate() when step <= 0x8000, where
+// step = ratio * 0x8000. The 512 entries form 128 consecutive groups of
+// InterpolationState::taps (4) coefficients; Interpolate() selects a group with
+// (fraction >> 8) and shifts each weighted sum right by 15 bits.
+constexpr std::array<s16, 512> curve_lut1{
+    -68,   32639, 69,    -5,    -200,  32630, 212,   -15,   -328,  32613, 359,   -26,   -450,
+    32586, 512,   -36,   -568,  32551, 669,   -47,   -680,  32507, 832,   -58,   -788,  32454,
+    1000,  -69,   -891,  32393, 1174,  -80,   -990,  32323, 1352,  -92,   -1084, 32244, 1536,
+    -103,  -1173, 32157, 1724,  -115,  -1258, 32061, 1919,  -128,  -1338, 31956, 2118,  -140,
+    -1414, 31844, 2322,  -153,  -1486, 31723, 2532,  -167,  -1554, 31593, 2747,  -180,  -1617,
+    31456, 2967,  -194,  -1676, 31310, 3192,  -209,  -1732, 31157, 3422,  -224,  -1783, 30995,
+    3657,  -240,  -1830, 30826, 3897,  -256,  -1874, 30649, 4143,  -272,  -1914, 30464, 4393,
+    -289,  -1951, 30272, 4648,  -307,  -1984, 30072, 4908,  -325,  -2014, 29866, 5172,  -343,
+    -2040, 29652, 5442,  -362,  -2063, 29431, 5716,  -382,  -2083, 29203, 5994,  -403,  -2100,
+    28968, 6277,  -424,  -2114, 28727, 6565,  -445,  -2125, 28480, 6857,  -468,  -2133, 28226,
+    7153,  -490,  -2139, 27966, 7453,  -514,  -2142, 27700, 7758,  -538,  -2142, 27428, 8066,
+    -563,  -2141, 27151, 8378,  -588,  -2136, 26867, 8694,  -614,  -2130, 26579, 9013,  -641,
+    -2121, 26285, 9336,  -668,  -2111, 25987, 9663,  -696,  -2098, 25683, 9993,  -724,  -2084,
+    25375, 10326, -753,  -2067, 25063, 10662, -783,  -2049, 24746, 11000, -813,  -2030, 24425,
+    11342, -844,  -2009, 24100, 11686, -875,  -1986, 23771, 12033, -907,  -1962, 23438, 12382,
+    -939,  -1937, 23103, 12733, -972,  -1911, 22764, 13086, -1005, -1883, 22422, 13441, -1039,
+    -1855, 22077, 13798, -1072, -1825, 21729, 14156, -1107, -1795, 21380, 14516, -1141, -1764,
+    21027, 14877, -1176, -1732, 20673, 15239, -1211, -1700, 20317, 15602, -1246, -1667, 19959,
+    15965, -1282, -1633, 19600, 16329, -1317, -1599, 19239, 16694, -1353, -1564, 18878, 17058,
+    -1388, -1530, 18515, 17423, -1424, -1495, 18151, 17787, -1459, -1459, 17787, 18151, -1495,
+    -1424, 17423, 18515, -1530, -1388, 17058, 18878, -1564, -1353, 16694, 19239, -1599, -1317,
+    16329, 19600, -1633, -1282, 15965, 19959, -1667, -1246, 15602, 20317, -1700, -1211, 15239,
+    20673, -1732, -1176, 14877, 21027, -1764, -1141, 14516, 21380, -1795, -1107, 14156, 21729,
+    -1825, -1072, 13798, 22077, -1855, -1039, 13441, 22422, -1883, -1005, 13086, 22764, -1911,
+    -972,  12733, 23103, -1937, -939,  12382, 23438, -1962, -907,  12033, 23771, -1986, -875,
+    11686, 24100, -2009, -844,  11342, 24425, -2030, -813,  11000, 24746, -2049, -783,  10662,
+    25063, -2067, -753,  10326, 25375, -2084, -724,  9993,  25683, -2098, -696,  9663,  25987,
+    -2111, -668,  9336,  26285, -2121, -641,  9013,  26579, -2130, -614,  8694,  26867, -2136,
+    -588,  8378,  27151, -2141, -563,  8066,  27428, -2142, -538,  7758,  27700, -2142, -514,
+    7453,  27966, -2139, -490,  7153,  28226, -2133, -468,  6857,  28480, -2125, -445,  6565,
+    28727, -2114, -424,  6277,  28968, -2100, -403,  5994,  29203, -2083, -382,  5716,  29431,
+    -2063, -362,  5442,  29652, -2040, -343,  5172,  29866, -2014, -325,  4908,  30072, -1984,
+    -307,  4648,  30272, -1951, -289,  4393,  30464, -1914, -272,  4143,  30649, -1874, -256,
+    3897,  30826, -1830, -240,  3657,  30995, -1783, -224,  3422,  31157, -1732, -209,  3192,
+    31310, -1676, -194,  2967,  31456, -1617, -180,  2747,  31593, -1554, -167,  2532,  31723,
+    -1486, -153,  2322,  31844, -1414, -140,  2118,  31956, -1338, -128,  1919,  32061, -1258,
+    -115,  1724,  32157, -1173, -103,  1536,  32244, -1084, -92,   1352,  32323, -990,  -80,
+    1174,  32393, -891,  -69,   1000,  32454, -788,  -58,   832,   32507, -680,  -47,   669,
+    32551, -568,  -36,   512,   32586, -450,  -26,   359,   32613, -328,  -15,   212,   32630,
+    -200,  -5,    69,    32639, -68};
+
+// Resampling coefficient table used by Interpolate() for the remaining case,
+// 0x8000 < step <= 0xaaaa, where step = ratio * 0x8000. The 512 entries form 128
+// consecutive groups of InterpolationState::taps (4) coefficients; Interpolate() selects a
+// group with (fraction >> 8) and shifts each weighted sum right by 15 bits.
+constexpr std::array<s16, 512> curve_lut2{
+    3195,  26287, 3329,  -32,   3064,  26281, 3467,  -34,   2936,  26270, 3608,  -38,   2811,
+    26253, 3751,  -42,   2688,  26230, 3897,  -46,   2568,  26202, 4046,  -50,   2451,  26169,
+    4199,  -54,   2338,  26130, 4354,  -58,   2227,  26085, 4512,  -63,   2120,  26035, 4673,
+    -67,   2015,  25980, 4837,  -72,   1912,  25919, 5004,  -76,   1813,  25852, 5174,  -81,
+    1716,  25780, 5347,  -87,   1622,  25704, 5522,  -92,   1531,  25621, 5701,  -98,   1442,
+    25533, 5882,  -103,  1357,  25440, 6066,  -109,  1274,  25342, 6253,  -115,  1193,  25239,
+    6442,  -121,  1115,  25131, 6635,  -127,  1040,  25018, 6830,  -133,  967,   24899, 7027,
+    -140,  897,   24776, 7227,  -146,  829,   24648, 7430,  -153,  764,   24516, 7635,  -159,
+    701,   24379, 7842,  -166,  641,   24237, 8052,  -174,  583,   24091, 8264,  -181,  526,
+    23940, 8478,  -187,  472,   23785, 8695,  -194,  420,   23626, 8914,  -202,  371,   23462,
+    9135,  -209,  324,   23295, 9358,  -215,  279,   23123, 9583,  -222,  236,   22948, 9809,
+    -230,  194,   22769, 10038, -237,  154,   22586, 10269, -243,  117,   22399, 10501, -250,
+    81,    22208, 10735, -258,  47,    22015, 10970, -265,  15,    21818, 11206, -271,  -16,
+    21618, 11444, -277,  -44,   21415, 11684, -283,  -71,   21208, 11924, -290,  -97,   20999,
+    12166, -296,  -121,  20786, 12409, -302,  -143,  20571, 12653, -306,  -163,  20354, 12898,
+    -311,  -183,  20134, 13143, -316,  -201,  19911, 13389, -321,  -218,  19686, 13635, -325,
+    -234,  19459, 13882, -328,  -248,  19230, 14130, -332,  -261,  18998, 14377, -335,  -273,
+    18765, 14625, -337,  -284,  18531, 14873, -339,  -294,  18295, 15121, -341,  -302,  18057,
+    15369, -341,  -310,  17817, 15617, -341,  -317,  17577, 15864, -340,  -323,  17335, 16111,
+    -340,  -328,  17092, 16357, -338,  -332,  16848, 16603, -336,  -336,  16603, 16848, -332,
+    -338,  16357, 17092, -328,  -340,  16111, 17335, -323,  -340,  15864, 17577, -317,  -341,
+    15617, 17817, -310,  -341,  15369, 18057, -302,  -341,  15121, 18295, -294,  -339,  14873,
+    18531, -284,  -337,  14625, 18765, -273,  -335,  14377, 18998, -261,  -332,  14130, 19230,
+    -248,  -328,  13882, 19459, -234,  -325,  13635, 19686, -218,  -321,  13389, 19911, -201,
+    -316,  13143, 20134, -183,  -311,  12898, 20354, -163,  -306,  12653, 20571, -143,  -302,
+    12409, 20786, -121,  -296,  12166, 20999, -97,   -290,  11924, 21208, -71,   -283,  11684,
+    21415, -44,   -277,  11444, 21618, -16,   -271,  11206, 21818, 15,    -265,  10970, 22015,
+    47,    -258,  10735, 22208, 81,    -250,  10501, 22399, 117,   -243,  10269, 22586, 154,
+    -237,  10038, 22769, 194,   -230,  9809,  22948, 236,   -222,  9583,  23123, 279,   -215,
+    9358,  23295, 324,   -209,  9135,  23462, 371,   -202,  8914,  23626, 420,   -194,  8695,
+    23785, 472,   -187,  8478,  23940, 526,   -181,  8264,  24091, 583,   -174,  8052,  24237,
+    641,   -166,  7842,  24379, 701,   -159,  7635,  24516, 764,   -153,  7430,  24648, 829,
+    -146,  7227,  24776, 897,   -140,  7027,  24899, 967,   -133,  6830,  25018, 1040,  -127,
+    6635,  25131, 1115,  -121,  6442,  25239, 1193,  -115,  6253,  25342, 1274,  -109,  6066,
+    25440, 1357,  -103,  5882,  25533, 1442,  -98,   5701,  25621, 1531,  -92,   5522,  25704,
+    1622,  -87,   5347,  25780, 1716,  -81,   5174,  25852, 1813,  -76,   5004,  25919, 1912,
+    -72,   4837,  25980, 2015,  -67,   4673,  26035, 2120,  -63,   4512,  26085, 2227,  -58,
+    4354,  26130, 2338,  -54,   4199,  26169, 2451,  -50,   4046,  26202, 2568,  -46,   3897,
+    26230, 2688,  -42,   3751,  26253, 2811,  -38,   3608,  26270, 2936,  -34,   3467,  26281,
+    3064,  -32,   3329,  26287, 3195};

 std::vector<s16> Interpolate(InterpolationState& state, std::vector<s16> input, double ratio) {
    if (input.size() < 2)
@@ -27,43 +147,51 @@ std::vector<s16> Interpolate(InterpolationState& state, std::vector<s16> input,

    if (ratio <= 0) {
        LOG_CRITICAL(Audio, "Nonsensical interpolation ratio {}", ratio);
-        ratio = 1.0;
+        return input;
    }

-    if (ratio != state.current_ratio) {
-        const double cutoff_frequency = std::min(0.5 / ratio, 0.5 * ratio);
-        state.nyquist = CascadingFilter::LowPass(std::clamp(cutoff_frequency, 0.0, 0.4), 3);
-        state.current_ratio = ratio;
-    }
-    state.nyquist.Process(input);
+    // Resampling step in fixed point: a ratio of 1.0 corresponds to 0x8000.
+    const s32 step{static_cast<s32>(ratio * 0x8000)};
+    // Select the coefficient table for this step. The explicit reference return type is
+    // required: with a deduced return type the lambda would return std::array by value and
+    // copy the whole 512-entry table into a temporary on every call.
+    const std::array<s16, 512>& lut = [step]() -> const std::array<s16, 512>& {
+        if (step > 0xaaaa) {
+            return curve_lut0;
+        }
+        if (step <= 0x8000) {
+            return curve_lut1;
+        }
+        return curve_lut2;
+    }();

-    constexpr std::size_t taps = InterpolationState::lanczos_taps;
-    const std::size_t num_frames = input.size() / 2;
+    const std::size_t num_frames{input.size() / 2};

    std::vector<s16> output;
-    output.reserve(static_cast<std::size_t>(input.size() / ratio + 4));
+    output.reserve(static_cast<std::size_t>(input.size() / ratio + InterpolationState::taps));

-    double& pos = state.position;
-    auto& h = state.history;
-    for (std::size_t i = 0; i < num_frames; ++i) {
-        std::rotate(h.begin(), h.end() - 1, h.end());
-        h[0][0] = input[i * 2 + 0];
-        h[0][1] = input[i * 2 + 1];
+    for (std::size_t frame{}; frame < num_frames; ++frame) {
+        const std::size_t lut_index{(state.fraction >> 8) * InterpolationState::taps};

-        while (pos <= 1.0) {
-            double l = 0.0;
-            double r = 0.0;
-            for (std::size_t j = 0; j < h.size(); j++) {
-                const double lanczos_calc = Lanczos(taps, pos + j - taps + 1);
-                l += lanczos_calc * h[j][0];
-                r += lanczos_calc * h[j][1];
-            }
-            output.emplace_back(static_cast<s16>(std::clamp(l, -32768.0, 32767.0)));
-            output.emplace_back(static_cast<s16>(std::clamp(r, -32768.0, 32767.0)));
+        std::rotate(state.history.begin(), state.history.end() - 1, state.history.end());
+        state.history[0][0] = input[frame * 2 + 0];
+        state.history[0][1] = input[frame * 2 + 1];

-            pos += ratio;
+        while (state.position <= 1.0) {
+            const s32 left{state.history[0][0] * lut[lut_index + 0] +
+                           state.history[1][0] * lut[lut_index + 1] +
+                           state.history[2][0] * lut[lut_index + 2] +
+                           state.history[3][0] * lut[lut_index + 3]};
+            const s32 right{state.history[0][1] * lut[lut_index + 0] +
+                            state.history[1][1] * lut[lut_index + 1] +
+                            state.history[2][1] * lut[lut_index + 2] +
+                            state.history[3][1] * lut[lut_index + 3]};
+            const s32 new_offset{state.fraction + step};
+
+            state.fraction = new_offset & 0x7fff;
+
+            output.emplace_back(static_cast<s16>(std::clamp(left >> 15, SHRT_MIN, SHRT_MAX)));
+            output.emplace_back(static_cast<s16>(std::clamp(right >> 15, SHRT_MIN, SHRT_MAX)));
+
+            state.position += ratio;
        }
-        pos -= 1.0;
+        state.position -= 1.0;
    }

    return output;
@@ -6,19 +6,17 @@

 #include <array>
 #include <vector>
-#include "audio_core/algorithm/filter.h"
+
 #include "common/common_types.h"

 namespace AudioCore {

+/// Resampler state that Interpolate() reads and updates on every call.
 struct InterpolationState {
-    static constexpr std::size_t lanczos_taps = 4;
-    static constexpr std::size_t history_size = lanczos_taps * 2 - 1;
-
-    double current_ratio = 0.0;
-    CascadingFilter nyquist;
-    std::array<std::array<s16, 2>, history_size> history = {};
-    double position = 0;
+    /// Number of coefficients read from the curve table per output sample.
+    static constexpr std::size_t taps{4};
+    /// Number of past stereo frames kept in `history`.
+    static constexpr std::size_t history_size{taps * 2 - 1};
+    /// Most recent input frames, newest at index 0; each entry is {left, right}.
+    std::array<std::array<s16, 2>, history_size> history{};
+    /// Output position; advanced by `ratio` per output frame and reduced by 1.0 per input frame.
+    double position{};
+    /// Fixed-point phase; advanced by the step and masked with 0x7fff after each output frame.
+    s32 fraction{};
 };

 /// Interpolates input signal to produce output signal.
@@ -131,8 +131,8 @@ add_library(core STATIC
    frontend/framebuffer_layout.cpp
    frontend/framebuffer_layout.h
    frontend/input.h
-    frontend/scope_acquire_window_context.cpp
-    frontend/scope_acquire_window_context.h
+    frontend/scope_acquire_context.cpp
+    frontend/scope_acquire_context.h
    gdbstub/gdbstub.cpp
    gdbstub/gdbstub.h
    hardware_interrupt_manager.cpp
@@ -187,6 +187,8 @@ add_library(core STATIC
    hle/kernel/synchronization.h
    hle/kernel/thread.cpp
    hle/kernel/thread.h
+    hle/kernel/time_manager.cpp
+    hle/kernel/time_manager.h
    hle/kernel/transfer_memory.cpp
    hle/kernel/transfer_memory.h
    hle/kernel/vm_manager.cpp
@@ -593,8 +595,12 @@ endif()

 if (ARCHITECTURE_x86_64)
    target_sources(core PRIVATE
-        arm/dynarmic/arm_dynarmic.cpp
-        arm/dynarmic/arm_dynarmic.h
+        arm/dynarmic/arm_dynarmic_32.cpp
+        arm/dynarmic/arm_dynarmic_32.h
+        arm/dynarmic/arm_dynarmic_64.cpp
+        arm/dynarmic/arm_dynarmic_64.h
+        arm/dynarmic/arm_dynarmic_cp15.cpp
+        arm/dynarmic/arm_dynarmic_cp15.h
    )
    target_link_libraries(core PRIVATE dynarmic)
 endif()
@@ -25,7 +25,20 @@ public:
    explicit ARM_Interface(System& system_) : system{system_} {}
    virtual ~ARM_Interface() = default;

-    struct ThreadContext {
+    /// Register state of an AArch32 thread. The layout is size-checked by the static_assert
+    /// that follows this struct.
+    struct ThreadContext32 {
+        std::array<u32, 16> cpu_registers;
+        u32 cpsr;
+        // 68 bytes precede this field; the 4 padding bytes bring the next array to offset 72.
+        std::array<u8, 4> padding;
+        std::array<u64, 32> fprs;
+        u32 fpscr;
+        u32 fpexc;
+        u32 tpidr;
+    };
+    // Internally within the kernel, it expects the AArch32 version of the
+    // thread context to be 344 bytes in size.
+    static_assert(sizeof(ThreadContext32) == 0x158);
+
+    struct ThreadContext64 {
        std::array<u64, 31> cpu_registers;
        u64 sp;
        u64 pc;
@@ -38,7 +51,7 @@ public:
    };
    // Internally within the kernel, it expects the AArch64 version of the
    // thread context to be 800 bytes in size.
-    static_assert(sizeof(ThreadContext) == 0x320);
+    static_assert(sizeof(ThreadContext64) == 0x320);

    /// Runs the CPU until an event happens
    virtual void Run() = 0;
@@ -130,17 +143,10 @@ public:
     */
    virtual void SetTPIDR_EL0(u64 value) = 0;

-    /**
-     * Saves the current CPU context
-     * @param ctx Thread context to save
-     */
-    virtual void SaveContext(ThreadContext& ctx) = 0;
-
-    /**
-     * Loads a CPU context
-     * @param ctx Thread context to load
-     */
-    virtual void LoadContext(const ThreadContext& ctx) = 0;
+    /// Saves the current CPU context into an AArch32 thread context.
+    virtual void SaveContext(ThreadContext32& ctx) = 0;
+    /// Saves the current CPU context into an AArch64 thread context.
+    virtual void SaveContext(ThreadContext64& ctx) = 0;
+    /// Loads a CPU context from an AArch32 thread context.
+    virtual void LoadContext(const ThreadContext32& ctx) = 0;
+    /// Loads a CPU context from an AArch64 thread context.
+    virtual void LoadContext(const ThreadContext64& ctx) = 0;

    /// Clears the exclusive monitor's state.
    virtual void ClearExclusiveState() = 0;
@@ -0,0 +1,208 @@
+// Copyright 2020 yuzu emulator team
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include <cinttypes>
+#include <memory>
+#include <dynarmic/A32/a32.h>
+#include <dynarmic/A32/config.h>
+#include <dynarmic/A32/context.h>
+#include "common/microprofile.h"
+#include "core/arm/dynarmic/arm_dynarmic_32.h"
+#include "core/arm/dynarmic/arm_dynarmic_64.h"
+#include "core/arm/dynarmic/arm_dynarmic_cp15.h"
+#include "core/core.h"
+#include "core/core_manager.h"
+#include "core/core_timing.h"
+#include "core/hle/kernel/svc.h"
+#include "core/memory.h"
+
+namespace Core {
+
+/// Callback object handed to the Dynarmic A32 JIT. It forwards guest memory accesses, SVCs and
+/// tick accounting to the System owned by the parent ARM_Dynarmic_32 core.
+class DynarmicCallbacks32 : public Dynarmic::A32::UserCallbacks {
+public:
+    explicit DynarmicCallbacks32(ARM_Dynarmic_32& parent) : parent(parent) {}
+
+    // Guest memory reads, forwarded to the emulated system memory.
+    u8 MemoryRead8(u32 vaddr) override {
+        return parent.system.Memory().Read8(vaddr);
+    }
+    u16 MemoryRead16(u32 vaddr) override {
+        return parent.system.Memory().Read16(vaddr);
+    }
+    u32 MemoryRead32(u32 vaddr) override {
+        return parent.system.Memory().Read32(vaddr);
+    }
+    u64 MemoryRead64(u32 vaddr) override {
+        return parent.system.Memory().Read64(vaddr);
+    }
+
+    // Guest memory writes, forwarded to the emulated system memory.
+    void MemoryWrite8(u32 vaddr, u8 value) override {
+        parent.system.Memory().Write8(vaddr, value);
+    }
+    void MemoryWrite16(u32 vaddr, u16 value) override {
+        parent.system.Memory().Write16(vaddr, value);
+    }
+    void MemoryWrite32(u32 vaddr, u32 value) override {
+        parent.system.Memory().Write32(vaddr, value);
+    }
+    void MemoryWrite64(u32 vaddr, u64 value) override {
+        parent.system.Memory().Write64(vaddr, value);
+    }
+
+    /// Interpreter fallback is not implemented for the 32-bit core.
+    void InterpreterFallback(u32 pc, std::size_t num_instructions) override {
+        UNIMPLEMENTED();
+    }
+
+    /// Logs the raised exception together with the faulting instruction word and flags it as
+    /// unimplemented. Every exception kind is handled the same way, so no per-kind dispatch is
+    /// needed. This is a CPU event, so it is logged under Core_ARM like the 64-bit core.
+    void ExceptionRaised(u32 pc, Dynarmic::A32::Exception exception) override {
+        LOG_CRITICAL(Core_ARM, "ExceptionRaised(exception = {}, pc = {:08X}, code = {:08X})",
+                     static_cast<std::size_t>(exception), pc, MemoryReadCode(pc));
+        UNIMPLEMENTED();
+    }
+
+    /// Dispatches a supervisor call to the kernel's SVC handler.
+    void CallSVC(u32 swi) override {
+        Kernel::CallSVC(parent.system, swi);
+    }
+
+    /// Reports executed ticks to core timing, amortized over the number of CPU cores.
+    void AddTicks(u64 ticks) override {
+        // Divide the number of ticks by the amount of CPU cores. TODO(Subv): This yields only a
+        // rough approximation of the amount of executed ticks in the system, it may be thrown off
+        // if not all cores are doing a similar amount of work. Instead of doing this, we should
+        // devise a way so that timing is consistent across all cores without increasing the ticks 4
+        // times.
+        u64 amortized_ticks = (ticks - num_interpreted_instructions) / Core::NUM_CPU_CORES;
+        // Always execute at least one tick.
+        amortized_ticks = std::max<u64>(amortized_ticks, 1);
+
+        parent.system.CoreTiming().AddTicks(amortized_ticks);
+        num_interpreted_instructions = 0;
+    }
+
+    /// Returns the core timing downcount, clamped so that it is never below zero.
+    u64 GetTicksRemaining() override {
+        return std::max(parent.system.CoreTiming().GetDowncount(), {});
+    }
+
+    ARM_Dynarmic_32& parent;
+    std::size_t num_interpreted_instructions{};
+    u64 tpidrro_el0{};
+    u64 tpidr_el0{};
+};
+
+/// Builds a new A32 JIT instance wired to this core's callbacks and CP15 register storage.
+/// The page table and address space width are not used yet (see the TODO below).
+std::shared_ptr<Dynarmic::A32::Jit> ARM_Dynarmic_32::MakeJit(Common::PageTable& page_table,
+                                                             std::size_t address_space_bits) const {
+    Dynarmic::A32::UserConfig config;
+    config.callbacks = cb.get();
+    // TODO(bunnei): Implement page table for 32-bit
+    // config.page_table = &page_table.pointers;
+    // This method is const, so CP15_regs is const here, while DynarmicCP15 is handed a mutable
+    // u32*. Spell the const removal out instead of hiding it in a C-style cast.
+    config.coprocessors[15] = std::make_shared<DynarmicCP15>(const_cast<u32*>(CP15_regs.data()));
+    config.define_unpredictable_behaviour = true;
+    // make_shared matches the 64-bit core and avoids a unique_ptr -> shared_ptr conversion.
+    return std::make_shared<Dynarmic::A32::Jit>(config);
+}
+
+MICROPROFILE_DEFINE(ARM_Jit_Dynarmic_32, "ARM JIT", "Dynarmic", MP_RGB(255, 64, 64));
+
+/// Runs the current JIT instance inside the ARM_Jit_Dynarmic_32 profiling scope.
+void ARM_Dynarmic_32::Run() {
+    MICROPROFILE_SCOPE(ARM_Jit_Dynarmic_32);
+    jit->Run();
+}
+
+/// Requests a single instruction at the current PC (R15) through the interpreter fallback
+/// callback, which is UNIMPLEMENTED() for the 32-bit core.
+void ARM_Dynarmic_32::Step() {
+    cb->InterpreterFallback(jit->Regs()[15], 1);
+}
+
+/// Creates the callback object and stores the core index and exclusive monitor.
+/// No JIT instance is created here; `jit` is first assigned in PageTableChanged().
+/// The exclusive monitor must be a DynarmicExclusiveMonitor: the reference dynamic_cast
+/// throws std::bad_cast otherwise.
+ARM_Dynarmic_32::ARM_Dynarmic_32(System& system, ExclusiveMonitor& exclusive_monitor,
+                                 std::size_t core_index)
+    : ARM_Interface{system},
+      cb(std::make_unique<DynarmicCallbacks32>(*this)), core_index{core_index},
+      exclusive_monitor{dynamic_cast<DynarmicExclusiveMonitor&>(exclusive_monitor)} {}
+
+// Defined out of line, after the full definition of DynarmicCallbacks32 above.
+ARM_Dynarmic_32::~ARM_Dynarmic_32() = default;
+
+/// Writes the program counter (R15); only the low 32 bits of `pc` are kept.
+void ARM_Dynarmic_32::SetPC(u64 pc) {
+    auto& registers = jit->Regs();
+    registers[15] = static_cast<u32>(pc);
+}
+
+/// Reads the program counter (R15).
+u64 ARM_Dynarmic_32::GetPC() const {
+    const auto& registers = jit->Regs();
+    return registers[15];
+}
+
+/// Reads general purpose register `index`.
+u64 ARM_Dynarmic_32::GetReg(int index) const {
+    const auto& registers = jit->Regs();
+    return registers[index];
+}
+
+/// Writes general purpose register `index`; only the low 32 bits of `value` are kept.
+void ARM_Dynarmic_32::SetReg(int index, u64 value) {
+    auto& registers = jit->Regs();
+    registers[index] = static_cast<u32>(value);
+}
+
+/// Vector registers are not exposed by this core; always returns a value-initialized u128.
+u128 ARM_Dynarmic_32::GetVectorReg(int index) const {
+    return u128{};
+}
+
+/// Vector registers are not exposed by this core; the call has no effect.
+void ARM_Dynarmic_32::SetVectorReg(int index, u128 value) {}
+
+/// Returns the JIT's CPSR value.
+u32 ARM_Dynarmic_32::GetPSTATE() const {
+    const u32 current_cpsr = jit->Cpsr();
+    return current_cpsr;
+}
+
+/// Replaces the JIT's CPSR value.
+void ARM_Dynarmic_32::SetPSTATE(u32 cpsr) {
+    jit->SetCpsr(cpsr);
+}
+
+/// Returns the TLS address, which is kept in the CP15_THREAD_URO slot of the CP15 registers.
+u64 ARM_Dynarmic_32::GetTlsAddress() const {
+    const auto slot = static_cast<std::size_t>(CP15Register::CP15_THREAD_URO);
+    return CP15_regs[slot];
+}
+
+/// Stores the TLS address in the CP15_THREAD_URO slot; only the low 32 bits are kept.
+void ARM_Dynarmic_32::SetTlsAddress(VAddr address) {
+    const auto slot = static_cast<std::size_t>(CP15Register::CP15_THREAD_URO);
+    CP15_regs[slot] = static_cast<u32>(address);
+}
+
+/// Returns the TPIDR_EL0 value kept in the callback object.
+u64 ARM_Dynarmic_32::GetTPIDR_EL0() const {
+    const u64 stored = cb->tpidr_el0;
+    return stored;
+}
+
+/// Stores the TPIDR_EL0 value in the callback object.
+void ARM_Dynarmic_32::SetTPIDR_EL0(u64 value) {
+    cb->tpidr_el0 = value;
+}
+
+/// Copies the JIT's general purpose registers and CPSR into `ctx`.
+/// NOTE(review): fprs, fpscr, fpexc and tpidr in `ctx` are left untouched here — confirm
+/// whether floating point state is meant to be saved as well.
+void ARM_Dynarmic_32::SaveContext(ThreadContext32& ctx) {
+    Dynarmic::A32::Context context;
+    jit->SaveContext(context);
+    ctx.cpu_registers = context.Regs();
+    ctx.cpsr = context.Cpsr();
+}
+
+/// Loads the general purpose registers and CPSR from `ctx` into the JIT.
+/// Only those two fields of `ctx` are read; the rest of the Dynarmic context keeps its
+/// default-constructed state.
+void ARM_Dynarmic_32::LoadContext(const ThreadContext32& ctx) {
+    Dynarmic::A32::Context context;
+    context.Regs() = ctx.cpu_registers;
+    context.SetCpsr(ctx.cpsr);
+    jit->LoadContext(context);
+}
+
+/// Asks the JIT to halt execution.
+void ARM_Dynarmic_32::PrepareReschedule() {
+    jit->HaltExecution();
+}
+
+/// Clears the cache of the currently selected JIT instance only; the other instances held in
+/// jit_cache are not touched.
+void ARM_Dynarmic_32::ClearInstructionCache() {
+    jit->ClearCache();
+}
+
+/// No-op: this core does not act on the exclusive monitor here.
+void ARM_Dynarmic_32::ClearExclusiveState() {}
+
+/// Switches `jit` to the instance associated with the given page table and address space
+/// width, creating and caching a new instance the first time a combination is seen.
+void ARM_Dynarmic_32::PageTableChanged(Common::PageTable& page_table,
+                                       std::size_t new_address_space_size_in_bits) {
+    const auto cache_key = std::make_pair(&page_table, new_address_space_size_in_bits);
+    const auto cached = jit_cache.find(cache_key);
+
+    if (cached == jit_cache.end()) {
+        // Not seen before: build a JIT for this combination and remember it.
+        jit = MakeJit(page_table, new_address_space_size_in_bits);
+        jit_cache.emplace(cache_key, jit);
+    } else {
+        // Reuse the previously built JIT.
+        jit = cached->second;
+    }
+}
+
+} // namespace Core
@@ -0,0 +1,77 @@
+// Copyright 2020 yuzu emulator team
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <memory>
+#include <unordered_map>
+
+#include <dynarmic/A32/a32.h>
+#include <dynarmic/A64/a64.h>
+#include <dynarmic/A64/exclusive_monitor.h>
+#include "common/common_types.h"
+#include "common/hash.h"
+#include "core/arm/arm_interface.h"
+#include "core/arm/exclusive_monitor.h"
+
+namespace Memory {
+class Memory;
+}
+
+namespace Core {
+
+class DynarmicCallbacks32;
+class DynarmicExclusiveMonitor;
+class System;
+
+/// ARM_Interface implementation backed by Dynarmic's A32 JIT.
+class ARM_Dynarmic_32 final : public ARM_Interface {
+public:
+    ARM_Dynarmic_32(System& system, ExclusiveMonitor& exclusive_monitor, std::size_t core_index);
+    ~ARM_Dynarmic_32() override;
+
+    // Register access. Values are exchanged as u64 by the interface; the implementation
+    // truncates writes to 32 bits.
+    void SetPC(u64 pc) override;
+    u64 GetPC() const override;
+    u64 GetReg(int index) const override;
+    void SetReg(int index, u64 value) override;
+    u128 GetVectorReg(int index) const override;
+    void SetVectorReg(int index, u128 value) override;
+    u32 GetPSTATE() const override;
+    void SetPSTATE(u32 pstate) override;
+    void Run() override;
+    void Step() override;
+    VAddr GetTlsAddress() const override;
+    void SetTlsAddress(VAddr address) override;
+    void SetTPIDR_EL0(u64 value) override;
+    u64 GetTPIDR_EL0() const override;
+
+    // Only the 32-bit context overloads do work; the 64-bit overloads are empty.
+    void SaveContext(ThreadContext32& ctx) override;
+    void SaveContext(ThreadContext64& ctx) override {}
+    void LoadContext(const ThreadContext32& ctx) override;
+    void LoadContext(const ThreadContext64& ctx) override {}
+
+    void PrepareReschedule() override;
+    void ClearExclusiveState() override;
+
+    void ClearInstructionCache() override;
+    void PageTableChanged(Common::PageTable& new_page_table,
+                          std::size_t new_address_space_size_in_bits) override;
+
+private:
+    /// Creates a new JIT instance for the given page table and address space width.
+    std::shared_ptr<Dynarmic::A32::Jit> MakeJit(Common::PageTable& page_table,
+                                                std::size_t address_space_bits) const;
+
+    // JIT instances are cached per (page table, address space bits) pair.
+    using JitCacheKey = std::pair<Common::PageTable*, std::size_t>;
+    using JitCacheType =
+        std::unordered_map<JitCacheKey, std::shared_ptr<Dynarmic::A32::Jit>, Common::PairHash>;
+
+    friend class DynarmicCallbacks32;
+    /// Callback object passed to every JIT instance created by MakeJit().
+    std::unique_ptr<DynarmicCallbacks32> cb;
+    /// Every JIT instance built so far, keyed by JitCacheKey.
+    JitCacheType jit_cache;
+    /// Currently selected JIT instance; assigned in PageTableChanged().
+    std::shared_ptr<Dynarmic::A32::Jit> jit;
+    std::size_t core_index;
+    DynarmicExclusiveMonitor& exclusive_monitor;
+    /// Storage for the CP15 coprocessor registers; also holds the TLS address.
+    std::array<u32, 84> CP15_regs{};
+};
+
+} // namespace Core
@@ -8,7 +8,7 @@
 #include <dynarmic/A64/config.h>
 #include "common/logging/log.h"
 #include "common/microprofile.h"
-#include "core/arm/dynarmic/arm_dynarmic.h"
+#include "core/arm/dynarmic/arm_dynarmic_64.h"
 #include "core/core.h"
 #include "core/core_manager.h"
 #include "core/core_timing.h"
@@ -25,9 +25,9 @@ namespace Core {

 using Vector = Dynarmic::A64::Vector;

-class ARM_Dynarmic_Callbacks : public Dynarmic::A64::UserCallbacks {
+class DynarmicCallbacks64 : public Dynarmic::A64::UserCallbacks {
 public:
-    explicit ARM_Dynarmic_Callbacks(ARM_Dynarmic& parent) : parent(parent) {}
+    explicit DynarmicCallbacks64(ARM_Dynarmic_64& parent) : parent(parent) {}

    u8 MemoryRead8(u64 vaddr) override {
        return parent.system.Memory().Read8(vaddr);
@@ -68,7 +68,7 @@ public:
        LOG_INFO(Core_ARM, "Unicorn fallback @ 0x{:X} for {} instructions (instr = {:08X})", pc,
                 num_instructions, MemoryReadCode(pc));

-        ARM_Interface::ThreadContext ctx;
+        ARM_Interface::ThreadContext64 ctx;
        parent.SaveContext(ctx);
        parent.inner_unicorn.LoadContext(ctx);
        parent.inner_unicorn.ExecuteInstructions(num_instructions);
@@ -90,7 +90,7 @@ public:
                parent.jit->HaltExecution();
                parent.SetPC(pc);
                Kernel::Thread* const thread = parent.system.CurrentScheduler().GetCurrentThread();
-                parent.SaveContext(thread->GetContext());
+                parent.SaveContext(thread->GetContext64());
                GDBStub::Break();
                GDBStub::SendTrap(thread, 5);
                return;
@@ -126,14 +126,14 @@ public:
        return Timing::CpuCyclesToClockCycles(parent.system.CoreTiming().GetTicks());
    }

-    ARM_Dynarmic& parent;
+    ARM_Dynarmic_64& parent;
    std::size_t num_interpreted_instructions = 0;
    u64 tpidrro_el0 = 0;
    u64 tpidr_el0 = 0;
 };

-std::unique_ptr<Dynarmic::A64::Jit> ARM_Dynarmic::MakeJit(Common::PageTable& page_table,
-                                                          std::size_t address_space_bits) const {
+std::shared_ptr<Dynarmic::A64::Jit> ARM_Dynarmic_64::MakeJit(Common::PageTable& page_table,
+                                                             std::size_t address_space_bits) const {
    Dynarmic::A64::UserConfig config;

    // Callbacks
@@ -159,79 +159,79 @@ std::unique_ptr<Dynarmic::A64::Jit> ARM_Dynarmic::MakeJit(Common::PageTable& pag
    // Unpredictable instructions
    config.define_unpredictable_behaviour = true;

-    return std::make_unique<Dynarmic::A64::Jit>(config);
+    return std::make_shared<Dynarmic::A64::Jit>(config);
 }

-MICROPROFILE_DEFINE(ARM_Jit_Dynarmic, "ARM JIT", "Dynarmic", MP_RGB(255, 64, 64));
+MICROPROFILE_DEFINE(ARM_Jit_Dynarmic_64, "ARM JIT", "Dynarmic", MP_RGB(255, 64, 64));

-void ARM_Dynarmic::Run() {
-    MICROPROFILE_SCOPE(ARM_Jit_Dynarmic);
+void ARM_Dynarmic_64::Run() {
+    MICROPROFILE_SCOPE(ARM_Jit_Dynarmic_64);

    jit->Run();
 }

-void ARM_Dynarmic::Step() {
+void ARM_Dynarmic_64::Step() {
    cb->InterpreterFallback(jit->GetPC(), 1);
 }

-ARM_Dynarmic::ARM_Dynarmic(System& system, ExclusiveMonitor& exclusive_monitor,
-                           std::size_t core_index)
+ARM_Dynarmic_64::ARM_Dynarmic_64(System& system, ExclusiveMonitor& exclusive_monitor,
+                                 std::size_t core_index)
    : ARM_Interface{system},
-      cb(std::make_unique<ARM_Dynarmic_Callbacks>(*this)), inner_unicorn{system},
+      cb(std::make_unique<DynarmicCallbacks64>(*this)), inner_unicorn{system},
      core_index{core_index}, exclusive_monitor{
                                  dynamic_cast<DynarmicExclusiveMonitor&>(exclusive_monitor)} {}

-ARM_Dynarmic::~ARM_Dynarmic() = default;
+ARM_Dynarmic_64::~ARM_Dynarmic_64() = default;

-void ARM_Dynarmic::SetPC(u64 pc) {
+void ARM_Dynarmic_64::SetPC(u64 pc) {
    jit->SetPC(pc);
 }

-u64 ARM_Dynarmic::GetPC() const {
+u64 ARM_Dynarmic_64::GetPC() const {
    return jit->GetPC();
 }

-u64 ARM_Dynarmic::GetReg(int index) const {
+u64 ARM_Dynarmic_64::GetReg(int index) const {
    return jit->GetRegister(index);
 }

-void ARM_Dynarmic::SetReg(int index, u64 value) {
+void ARM_Dynarmic_64::SetReg(int index, u64 value) {
    jit->SetRegister(index, value);
 }

-u128 ARM_Dynarmic::GetVectorReg(int index) const {
+u128 ARM_Dynarmic_64::GetVectorReg(int index) const {
    return jit->GetVector(index);
 }

-void ARM_Dynarmic::SetVectorReg(int index, u128 value) {
+void ARM_Dynarmic_64::SetVectorReg(int index, u128 value) {
    jit->SetVector(index, value);
 }

-u32 ARM_Dynarmic::GetPSTATE() const {
+u32 ARM_Dynarmic_64::GetPSTATE() const {
    return jit->GetPstate();
 }

-void ARM_Dynarmic::SetPSTATE(u32 pstate) {
+void ARM_Dynarmic_64::SetPSTATE(u32 pstate) {
    jit->SetPstate(pstate);
 }

-u64 ARM_Dynarmic::GetTlsAddress() const {
+u64 ARM_Dynarmic_64::GetTlsAddress() const {
    return cb->tpidrro_el0;
 }

-void ARM_Dynarmic::SetTlsAddress(VAddr address) {
+void ARM_Dynarmic_64::SetTlsAddress(VAddr address) {
    cb->tpidrro_el0 = address;
 }

-u64 ARM_Dynarmic::GetTPIDR_EL0() const {
+u64 ARM_Dynarmic_64::GetTPIDR_EL0() const {
    return cb->tpidr_el0;
 }

-void ARM_Dynarmic::SetTPIDR_EL0(u64 value) {
+void ARM_Dynarmic_64::SetTPIDR_EL0(u64 value) {
    cb->tpidr_el0 = value;
 }

-void ARM_Dynarmic::SaveContext(ThreadContext& ctx) {
+void ARM_Dynarmic_64::SaveContext(ThreadContext64& ctx) {
    ctx.cpu_registers = jit->GetRegisters();
    ctx.sp = jit->GetSP();
    ctx.pc = jit->GetPC();
@@ -242,7 +242,7 @@ void ARM_Dynarmic::SaveContext(ThreadContext& ctx) {
    ctx.tpidr = cb->tpidr_el0;
 }

-void ARM_Dynarmic::LoadContext(const ThreadContext& ctx) {
+void ARM_Dynarmic_64::LoadContext(const ThreadContext64& ctx) {
    jit->SetRegisters(ctx.cpu_registers);
    jit->SetSP(ctx.sp);
    jit->SetPC(ctx.pc);
@@ -253,25 +253,32 @@ void ARM_Dynarmic::LoadContext(const ThreadContext& ctx) {
    SetTPIDR_EL0(ctx.tpidr);
 }

-void ARM_Dynarmic::PrepareReschedule() {
+void ARM_Dynarmic_64::PrepareReschedule() {
    jit->HaltExecution();
 }

-void ARM_Dynarmic::ClearInstructionCache() {
+void ARM_Dynarmic_64::ClearInstructionCache() {
    jit->ClearCache();
 }

-void ARM_Dynarmic::ClearExclusiveState() {
+void ARM_Dynarmic_64::ClearExclusiveState() {
    jit->ClearExclusiveState();
 }

-void ARM_Dynarmic::PageTableChanged(Common::PageTable& page_table,
-                                    std::size_t new_address_space_size_in_bits) {
+void ARM_Dynarmic_64::PageTableChanged(Common::PageTable& page_table,
+                                       std::size_t new_address_space_size_in_bits) {
+    // Reuse the JIT previously built for this (page table, address width) pair, if any.
+    const JitCacheKey cache_key{&page_table, new_address_space_size_in_bits};
+    if (const auto cached = jit_cache.find(cache_key); cached != jit_cache.end()) {
+        jit = cached->second;
+        return;
+    }
+    // Cache miss: build a new JIT below and remember it for later switches.
    jit = MakeJit(page_table, new_address_space_size_in_bits);
+    jit_cache.emplace(cache_key, jit);
 }

-DynarmicExclusiveMonitor::DynarmicExclusiveMonitor(Memory::Memory& memory_, std::size_t core_count)
-    : monitor(core_count), memory{memory_} {}
+DynarmicExclusiveMonitor::DynarmicExclusiveMonitor(Memory::Memory& memory, std::size_t core_count)
+    : monitor(core_count), memory{memory} {}

 DynarmicExclusiveMonitor::~DynarmicExclusiveMonitor() = default;

@@ -5,9 +5,12 @@
 #pragma once

 #include <memory>
+#include <unordered_map>
+
 #include <dynarmic/A64/a64.h>
 #include <dynarmic/A64/exclusive_monitor.h>
 #include "common/common_types.h"
+#include "common/hash.h"
 #include "core/arm/arm_interface.h"
 #include "core/arm/exclusive_monitor.h"
 #include "core/arm/unicorn/arm_unicorn.h"
@@ -18,14 +21,14 @@ class Memory;

 namespace Core {

-class ARM_Dynarmic_Callbacks;
+class DynarmicCallbacks64;
 class DynarmicExclusiveMonitor;
 class System;

-class ARM_Dynarmic final : public ARM_Interface {
+class ARM_Dynarmic_64 final : public ARM_Interface {
 public:
-    ARM_Dynarmic(System& system, ExclusiveMonitor& exclusive_monitor, std::size_t core_index);
-    ~ARM_Dynarmic() override;
+    ARM_Dynarmic_64(System& system, ExclusiveMonitor& exclusive_monitor, std::size_t core_index);
+    ~ARM_Dynarmic_64() override;

    void SetPC(u64 pc) override;
    u64 GetPC() const override;
@@ -42,8 +45,10 @@ public:
    void SetTPIDR_EL0(u64 value) override;
    u64 GetTPIDR_EL0() const override;

-    void SaveContext(ThreadContext& ctx) override;
-    void LoadContext(const ThreadContext& ctx) override;
+    void SaveContext(ThreadContext32& ctx) override {}
+    void SaveContext(ThreadContext64& ctx) override;
+    void LoadContext(const ThreadContext32& ctx) override {}
+    void LoadContext(const ThreadContext64& ctx) override;

    void PrepareReschedule() override;
    void ClearExclusiveState() override;
@@ -53,12 +58,17 @@ public:
                          std::size_t new_address_space_size_in_bits) override;

 private:
-    std::unique_ptr<Dynarmic::A64::Jit> MakeJit(Common::PageTable& page_table,
+    std::shared_ptr<Dynarmic::A64::Jit> MakeJit(Common::PageTable& page_table,
                                                std::size_t address_space_bits) const;

-    friend class ARM_Dynarmic_Callbacks;
-    std::unique_ptr<ARM_Dynarmic_Callbacks> cb;
-    std::unique_ptr<Dynarmic::A64::Jit> jit;
+    using JitCacheKey = std::pair<Common::PageTable*, std::size_t>;
+    using JitCacheType =
+        std::unordered_map<JitCacheKey, std::shared_ptr<Dynarmic::A64::Jit>, Common::PairHash>;
+
+    friend class DynarmicCallbacks64;
+    std::unique_ptr<DynarmicCallbacks64> cb;
+    JitCacheType jit_cache;
+    std::shared_ptr<Dynarmic::A64::Jit> jit;
    ARM_Unicorn inner_unicorn;

    std::size_t core_index;
@@ -67,7 +77,7 @@ private:

 class DynarmicExclusiveMonitor final : public ExclusiveMonitor {
 public:
-    explicit DynarmicExclusiveMonitor(Memory::Memory& memory_, std::size_t core_count);
+    explicit DynarmicExclusiveMonitor(Memory::Memory& memory, std::size_t core_count);
    ~DynarmicExclusiveMonitor() override;

    void SetExclusive(std::size_t core_index, VAddr addr) override;
@@ -80,7 +90,7 @@ public:
    bool ExclusiveWrite128(std::size_t core_index, VAddr vaddr, u128 value) override;

 private:
-    friend class ARM_Dynarmic;
+    friend class ARM_Dynarmic_64;
    Dynarmic::A64::ExclusiveMonitor monitor;
    Memory::Memory& memory;
 };
@@ -0,0 +1,80 @@
+// Copyright 2017 Citra Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include "core/arm/dynarmic/arm_dynarmic_cp15.h"
+
+using Callback = Dynarmic::A32::Coprocessor::Callback;
+using CallbackOrAccessOneWord = Dynarmic::A32::Coprocessor::CallbackOrAccessOneWord;
+using CallbackOrAccessTwoWords = Dynarmic::A32::Coprocessor::CallbackOrAccessTwoWords;
+
+std::optional<Callback> DynarmicCP15::CompileInternalOperation(bool two, unsigned opc1,
+                                                               CoprocReg CRd, CoprocReg CRn,
+                                                               CoprocReg CRm, unsigned opc2) {
+    return std::nullopt; // No internal operation is compiled for any encoding.
+}
+
+CallbackOrAccessOneWord DynarmicCP15::CompileSendOneWord(bool two, unsigned opc1, CoprocReg CRn,
+                                                         CoprocReg CRm, unsigned opc2) {
+    // TODO(merry): Privileged CP15 registers
+
+    // Every write handled below uses the non-"two" encoding with opc1 == 0.
+    if (two || opc1 != 0) {
+        return {};
+    }
+
+    const auto slot = [this](CP15Register reg) { return &CP15[static_cast<std::size_t>(reg)]; };
+
+    if (CRn == CoprocReg::C7 && CRm == CoprocReg::C5 && opc2 == 4) {
+        // Dummy write: the stored value is ignored.
+        return slot(CP15Register::CP15_FLUSH_PREFETCH_BUFFER);
+    }
+    if (CRn == CoprocReg::C7 && CRm == CoprocReg::C10 && opc2 == 4) {
+        // Dummy write: the stored value is ignored.
+        return slot(CP15Register::CP15_DATA_SYNC_BARRIER);
+    }
+    if (CRn == CoprocReg::C7 && CRm == CoprocReg::C10 && opc2 == 5) {
+        // Dummy write: the stored value is ignored.
+        return slot(CP15Register::CP15_DATA_MEMORY_BARRIER);
+    }
+    if (CRn == CoprocReg::C13 && CRm == CoprocReg::C0 && opc2 == 2) {
+        return slot(CP15Register::CP15_THREAD_UPRW);
+    }
+    return {};
+}
+
+CallbackOrAccessTwoWords DynarmicCP15::CompileSendTwoWords(bool two, unsigned opc, CoprocReg CRm) {
+    return CallbackOrAccessTwoWords{}; // Two-word writes are not handled.
+}
+
+CallbackOrAccessOneWord DynarmicCP15::CompileGetOneWord(bool two, unsigned opc1, CoprocReg CRn,
+                                                        CoprocReg CRm, unsigned opc2) {
+    // TODO(merry): Privileged CP15 registers
+
+    // Only c13/c0 reads (non-"two" encoding, opc1 == 0) are handled; opc2 selects the register.
+    const bool thread_reg = !two && CRn == CoprocReg::C13 && opc1 == 0 && CRm == CoprocReg::C0;
+    if (!thread_reg) {
+        return {};
+    }
+    if (opc2 == 2) {
+        return &CP15[static_cast<std::size_t>(CP15Register::CP15_THREAD_UPRW)];
+    }
+    if (opc2 == 3) {
+        return &CP15[static_cast<std::size_t>(CP15Register::CP15_THREAD_URO)];
+    }
+    return {};
+}
+
+CallbackOrAccessTwoWords DynarmicCP15::CompileGetTwoWords(bool two, unsigned opc, CoprocReg CRm) {
+    return CallbackOrAccessTwoWords{}; // Two-word reads are not handled.
+}
+
+std::optional<Callback> DynarmicCP15::CompileLoadWords(bool two, bool long_transfer, CoprocReg CRd,
+                                                       std::optional<u8> option) {
+    return std::nullopt; // Coprocessor loads are not handled.
+}
+
+std::optional<Callback> DynarmicCP15::CompileStoreWords(bool two, bool long_transfer, CoprocReg CRd,
+                                                        std::optional<u8> option) {
+    return std::nullopt; // Coprocessor stores are not handled.
+}
@@ -0,0 +1,152 @@
+// Copyright 2017 Citra Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <memory>
+#include <optional>
+
+#include <dynarmic/A32/coprocessor.h>
+#include "common/common_types.h"
+
+enum class CP15Register {
+    // c0 - Information registers
+    CP15_MAIN_ID,
+    CP15_CACHE_TYPE,
+    CP15_TCM_STATUS,
+    CP15_TLB_TYPE,
+    CP15_CPU_ID,
+    CP15_PROCESSOR_FEATURE_0,
+    CP15_PROCESSOR_FEATURE_1,
+    CP15_DEBUG_FEATURE_0,
+    CP15_AUXILIARY_FEATURE_0,
+    CP15_MEMORY_MODEL_FEATURE_0,
+    CP15_MEMORY_MODEL_FEATURE_1,
+    CP15_MEMORY_MODEL_FEATURE_2,
+    CP15_MEMORY_MODEL_FEATURE_3,
+    CP15_ISA_FEATURE_0,
+    CP15_ISA_FEATURE_1,
+    CP15_ISA_FEATURE_2,
+    CP15_ISA_FEATURE_3,
+    CP15_ISA_FEATURE_4,
+
+    // c1 - Control registers
+    CP15_CONTROL,
+    CP15_AUXILIARY_CONTROL,
+    CP15_COPROCESSOR_ACCESS_CONTROL,
+
+    // c2 - Translation table registers
+    CP15_TRANSLATION_BASE_TABLE_0,
+    CP15_TRANSLATION_BASE_TABLE_1,
+    CP15_TRANSLATION_BASE_CONTROL,
+    CP15_DOMAIN_ACCESS_CONTROL,
+    CP15_RESERVED,
+
+    // c5 - Fault status registers
+    CP15_FAULT_STATUS,
+    CP15_INSTR_FAULT_STATUS,
+    CP15_COMBINED_DATA_FSR = CP15_FAULT_STATUS,
+    CP15_INST_FSR, // Same value as CP15_INSTR_FAULT_STATUS (continues from the alias above)
+
+    // c6 - Fault Address registers
+    CP15_FAULT_ADDRESS,
+    CP15_COMBINED_DATA_FAR = CP15_FAULT_ADDRESS,
+    CP15_WFAR,
+    CP15_IFAR,
+
+    // c7 - Cache operation registers
+    CP15_WAIT_FOR_INTERRUPT,
+    CP15_PHYS_ADDRESS,
+    CP15_INVALIDATE_INSTR_CACHE,
+    CP15_INVALIDATE_INSTR_CACHE_USING_MVA,
+    CP15_INVALIDATE_INSTR_CACHE_USING_INDEX,
+    CP15_FLUSH_PREFETCH_BUFFER,
+    CP15_FLUSH_BRANCH_TARGET_CACHE,
+    CP15_FLUSH_BRANCH_TARGET_CACHE_ENTRY,
+    CP15_INVALIDATE_DATA_CACHE,
+    CP15_INVALIDATE_DATA_CACHE_LINE_USING_MVA,
+    CP15_INVALIDATE_DATA_CACHE_LINE_USING_INDEX,
+    CP15_INVALIDATE_DATA_AND_INSTR_CACHE,
+    CP15_CLEAN_DATA_CACHE,
+    CP15_CLEAN_DATA_CACHE_LINE_USING_MVA,
+    CP15_CLEAN_DATA_CACHE_LINE_USING_INDEX,
+    CP15_DATA_SYNC_BARRIER,
+    CP15_DATA_MEMORY_BARRIER,
+    CP15_CLEAN_AND_INVALIDATE_DATA_CACHE,
+    CP15_CLEAN_AND_INVALIDATE_DATA_CACHE_LINE_USING_MVA,
+    CP15_CLEAN_AND_INVALIDATE_DATA_CACHE_LINE_USING_INDEX,
+
+    // c8 - TLB operations
+    CP15_INVALIDATE_ITLB,
+    CP15_INVALIDATE_ITLB_SINGLE_ENTRY,
+    CP15_INVALIDATE_ITLB_ENTRY_ON_ASID_MATCH,
+    CP15_INVALIDATE_ITLB_ENTRY_ON_MVA,
+    CP15_INVALIDATE_DTLB,
+    CP15_INVALIDATE_DTLB_SINGLE_ENTRY,
+    CP15_INVALIDATE_DTLB_ENTRY_ON_ASID_MATCH,
+    CP15_INVALIDATE_DTLB_ENTRY_ON_MVA,
+    CP15_INVALIDATE_UTLB,
+    CP15_INVALIDATE_UTLB_SINGLE_ENTRY,
+    CP15_INVALIDATE_UTLB_ENTRY_ON_ASID_MATCH,
+    CP15_INVALIDATE_UTLB_ENTRY_ON_MVA,
+
+    // c9 - Data cache lockdown register
+    CP15_DATA_CACHE_LOCKDOWN,
+
+    // c10 - TLB/Memory map registers
+    CP15_TLB_LOCKDOWN,
+    CP15_PRIMARY_REGION_REMAP,
+    CP15_NORMAL_REGION_REMAP,
+
+    // c13 - Thread related registers
+    CP15_PID,
+    CP15_CONTEXT_ID,
+    CP15_THREAD_UPRW, // Thread ID register - User/Privileged Read/Write
+    CP15_THREAD_URO,  // Thread ID register - User Read Only (Privileged R/W)
+    CP15_THREAD_PRW,  // Thread ID register - Privileged R/W only.
+
+    // c15 - Performance and TLB lockdown registers
+    CP15_PERFORMANCE_MONITOR_CONTROL,
+    CP15_CYCLE_COUNTER,
+    CP15_COUNT_0,
+    CP15_COUNT_1,
+    CP15_READ_MAIN_TLB_LOCKDOWN_ENTRY,
+    CP15_WRITE_MAIN_TLB_LOCKDOWN_ENTRY,
+    CP15_MAIN_TLB_LOCKDOWN_VIRT_ADDRESS,
+    CP15_MAIN_TLB_LOCKDOWN_PHYS_ADDRESS,
+    CP15_MAIN_TLB_LOCKDOWN_ATTRIBUTE,
+    CP15_TLB_DEBUG_CONTROL,
+
+    // Skyeye defined
+    CP15_TLB_FAULT_ADDR,
+    CP15_TLB_FAULT_STATUS,
+
+    // Not an actual register.
+    // All registers should be defined above this.
+    CP15_REGISTER_COUNT,
+};
+
+class DynarmicCP15 final : public Dynarmic::A32::Coprocessor {
+public:
+    using CoprocReg = Dynarmic::A32::CoprocReg;
+
+    explicit DynarmicCP15(u32* cp15) : CP15(cp15) {}
+
+    std::optional<Callback> CompileInternalOperation(bool two, unsigned opc1, CoprocReg CRd,
+                                                     CoprocReg CRn, CoprocReg CRm,
+                                                     unsigned opc2) override;
+    CallbackOrAccessOneWord CompileSendOneWord(bool two, unsigned opc1, CoprocReg CRn,
+                                               CoprocReg CRm, unsigned opc2) override;
+    CallbackOrAccessTwoWords CompileSendTwoWords(bool two, unsigned opc, CoprocReg CRm) override;
+    CallbackOrAccessOneWord CompileGetOneWord(bool two, unsigned opc1, CoprocReg CRn, CoprocReg CRm,
+                                              unsigned opc2) override;
+    CallbackOrAccessTwoWords CompileGetTwoWords(bool two, unsigned opc, CoprocReg CRm) override;
+    std::optional<Callback> CompileLoadWords(bool two, bool long_transfer, CoprocReg CRd,
+                                             std::optional<u8> option) override;
+    std::optional<Callback> CompileStoreWords(bool two, bool long_transfer, CoprocReg CRd,
+                                              std::optional<u8> option) override;
+
+private:
+    u32* CP15{}; // Non-owning; indexed with CP15Register values by the Compile* methods.
+};
@@ -3,7 +3,7 @@
 // Refer to the license.txt file included.

 #ifdef ARCHITECTURE_x86_64
-#include "core/arm/dynarmic/arm_dynarmic.h"
+#include "core/arm/dynarmic/arm_dynarmic_64.h"
 #endif
 #include "core/arm/exclusive_monitor.h"
 #include "core/memory.h"
@@ -53,7 +53,7 @@ static bool UnmappedMemoryHook(uc_engine* uc, uc_mem_type type, u64 addr, int si
                               void* user_data) {
    auto* const system = static_cast<System*>(user_data);

-    ARM_Interface::ThreadContext ctx{};
+    ARM_Interface::ThreadContext64 ctx{};
    system->CurrentArmInterface().SaveContext(ctx);
    ASSERT_MSG(false, "Attempted to read from unmapped memory: 0x{:X}, pc=0x{:X}, lr=0x{:X}", addr,
               ctx.pc, ctx.cpu_registers[30]);
@@ -179,7 +179,7 @@ void ARM_Unicorn::ExecuteInstructions(std::size_t num_instructions) {
        }

        Kernel::Thread* const thread = system.CurrentScheduler().GetCurrentThread();
-        SaveContext(thread->GetContext());
+        SaveContext(thread->GetContext64());
        if (last_bkpt_hit || GDBStub::IsMemoryBreak() || GDBStub::GetCpuStepFlag()) {
            last_bkpt_hit = false;
            GDBStub::Break();
@@ -188,7 +188,7 @@ void ARM_Unicorn::ExecuteInstructions(std::size_t num_instructions) {
    }
 }

-void ARM_Unicorn::SaveContext(ThreadContext& ctx) {
+void ARM_Unicorn::SaveContext(ThreadContext64& ctx) {
    int uregs[32];
    void* tregs[32];

@@ -215,7 +215,7 @@ void ARM_Unicorn::SaveContext(ThreadContext& ctx) {
    CHECKED(uc_reg_read_batch(uc, uregs, tregs, 32));
 }

-void ARM_Unicorn::LoadContext(const ThreadContext& ctx) {
+void ARM_Unicorn::LoadContext(const ThreadContext64& ctx) {
    int uregs[32];
    void* tregs[32];

@@ -30,8 +30,6 @@ public:
    void SetTlsAddress(VAddr address) override;
    void SetTPIDR_EL0(u64 value) override;
    u64 GetTPIDR_EL0() const override;
-    void SaveContext(ThreadContext& ctx) override;
-    void LoadContext(const ThreadContext& ctx) override;
    void PrepareReschedule() override;
    void ClearExclusiveState() override;
    void ExecuteInstructions(std::size_t num_instructions);
@@ -41,6 +39,11 @@ public:
    void PageTableChanged(Common::PageTable&, std::size_t) override {}
    void RecordBreak(GDBStub::BreakpointAddress bkpt);

+    void SaveContext(ThreadContext32& ctx) override {}
+    void SaveContext(ThreadContext64& ctx) override;
+    void LoadContext(const ThreadContext32& ctx) override {}
+    void LoadContext(const ThreadContext64& ctx) override;
+
 private:
    static void InterruptHook(uc_engine* uc, u32 int_no, void* user_data);

@@ -24,6 +24,7 @@
 #include "core/file_sys/sdmc_factory.h"
 #include "core/file_sys/vfs_concat.h"
 #include "core/file_sys/vfs_real.h"
+#include "core/frontend/scope_acquire_context.h"
 #include "core/gdbstub/gdbstub.h"
 #include "core/hardware_interrupt_manager.h"
 #include "core/hle/kernel/client_port.h"
@@ -184,6 +185,8 @@ struct System::Impl {

    ResultStatus Load(System& system, Frontend::EmuWindow& emu_window,
                      const std::string& filepath) {
+        Core::Frontend::ScopeAcquireContext acquire_context{emu_window};
+
        app_loader = Loader::GetLoader(GetGameFileFromPath(virtual_filesystem, filepath));
        if (!app_loader) {
            LOG_CRITICAL(Core, "Failed to obtain loader for {}!", filepath);
@@ -707,4 +710,12 @@ const Service::SM::ServiceManager& System::ServiceManager() const {
    return *impl->service_manager;
 }

+void System::RegisterCoreThread(std::size_t id) {
+    impl->kernel.RegisterCoreThread(id); // Forwards to the kernel's RegisterCoreThread.
+}
+
+void System::RegisterHostThread() {
+    impl->kernel.RegisterHostThread(); // Forwards to the kernel's RegisterHostThread.
+}
+
 } // namespace Core
@@ -360,6 +360,12 @@ public:

    const CurrentBuildProcessID& GetCurrentProcessBuildID() const;

+    /// Register a host thread as an emulated CPU Core.
+    void RegisterCoreThread(std::size_t id);
+
+    /// Register a host thread as an auxiliary thread.
+    void RegisterHostThread();
+
 private:
    System();

@@ -6,9 +6,6 @@
 #include <mutex>

 #include "common/logging/log.h"
-#ifdef ARCHITECTURE_x86_64
-#include "core/arm/dynarmic/arm_dynarmic.h"
-#endif
 #include "core/arm/exclusive_monitor.h"
 #include "core/arm/unicorn/arm_unicorn.h"
 #include "core/core.h"
@@ -26,9 +26,6 @@ public:

    /// Releases (dunno if this is the "right" word) the context from the caller thread
    virtual void DoneCurrent() = 0;
-
-    /// Swap buffers to display the next frame
-    virtual void SwapBuffers() = 0;
 };

 /**
@@ -27,9 +27,9 @@ FramebufferLayout DefaultFrameLayout(u32 width, u32 height) {
    // so just calculate them both even if the other isn't showing.
    FramebufferLayout res{width, height};

-    const float emulation_aspect_ratio{static_cast<float>(ScreenUndocked::Height) /
-                                       ScreenUndocked::Width};
-    const auto window_aspect_ratio = static_cast<float>(height) / width;
+    const float window_aspect_ratio = static_cast<float>(height) / width;
+    const float emulation_aspect_ratio = EmulationAspectRatio(
+        static_cast<AspectRatio>(Settings::values.aspect_ratio), window_aspect_ratio);

    const Common::Rectangle<u32> screen_window_area{0, 0, width, height};
    Common::Rectangle<u32> screen = MaxRectangle(screen_window_area, emulation_aspect_ratio);
@@ -58,4 +58,19 @@ FramebufferLayout FrameLayoutFromResolutionScale(u32 res_scale) {
    return DefaultFrameLayout(width, height);
 }

+float EmulationAspectRatio(AspectRatio aspect, float window_aspect_ratio) {
+    switch (aspect) {
+    case AspectRatio::R4_3:
+        return 3.0f / 4.0f;
+    case AspectRatio::R21_9:
+        return 9.0f / 21.0f;
+    case AspectRatio::StretchToWindow:
+        return window_aspect_ratio;
+    case AspectRatio::Default:
+    default:
+        // Default and any unrecognized value use the undocked screen's height/width ratio.
+        return static_cast<float>(ScreenUndocked::Height) / ScreenUndocked::Width;
+    }
+}
+
 } // namespace Layout
@@ -18,10 +18,18 @@ enum ScreenDocked : u32 {
    HeightDocked = 1080,
 };

+enum class AspectRatio {
+    Default,         // Undocked screen ratio (ScreenUndocked::Height / ScreenUndocked::Width)
+    R4_3,            // 4:3 (width:height)
+    R21_9,           // 21:9 (width:height)
+    StretchToWindow, // Uses the window's own aspect ratio
+};
+
 /// Describes the layout of the window framebuffer
 struct FramebufferLayout {
    u32 width{ScreenUndocked::Width};
    u32 height{ScreenUndocked::Height};
+    bool is_srgb{};

    Common::Rectangle<u32> screen;

@@ -48,4 +56,12 @@ FramebufferLayout DefaultFrameLayout(u32 width, u32 height);
 */
 FramebufferLayout FrameLayoutFromResolutionScale(u32 res_scale);

+/**
+ * Convenience method to determine emulation aspect ratio
+ * @param aspect Represents the index of aspect ratio stored in Settings::values.aspect_ratio
+ * @param window_aspect_ratio Current window aspect ratio
+ * @return Emulation render window aspect ratio
+ */
+float EmulationAspectRatio(AspectRatio aspect, float window_aspect_ratio);
+
 } // namespace Layout
@@ -0,0 +1,18 @@
+// Copyright 2019 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include "core/frontend/emu_window.h"
+#include "core/frontend/scope_acquire_context.h"
+
+namespace Core::Frontend {
+
+ScopeAcquireContext::ScopeAcquireContext(Core::Frontend::GraphicsContext& context)
+    : context{context} {
+    context.MakeCurrent(); // Acquire on construction; the destructor calls DoneCurrent().
+}
+ScopeAcquireContext::~ScopeAcquireContext() {
+    context.DoneCurrent(); // Release the context when the scope object is destroyed.
+}
+
+} // namespace Core::Frontend
@@ -8,16 +8,16 @@

 namespace Core::Frontend {

-class EmuWindow;
+class GraphicsContext;

 /// Helper class to acquire/release window context within a given scope
-class ScopeAcquireWindowContext : NonCopyable {
+class ScopeAcquireContext : NonCopyable {
 public:
-    explicit ScopeAcquireWindowContext(Core::Frontend::EmuWindow& window);
-    ~ScopeAcquireWindowContext();
+    explicit ScopeAcquireContext(Core::Frontend::GraphicsContext& context);
+    ~ScopeAcquireContext();

 private:
-    Core::Frontend::EmuWindow& emu_window;
+    Core::Frontend::GraphicsContext& context;
 };

 } // namespace Core::Frontend
@@ -1,18 +0,0 @@
-// Copyright 2019 yuzu Emulator Project
-// Licensed under GPLv2 or any later version
-// Refer to the license.txt file included.
-
-#include "core/frontend/emu_window.h"
-#include "core/frontend/scope_acquire_window_context.h"
-
-namespace Core::Frontend {
-
-ScopeAcquireWindowContext::ScopeAcquireWindowContext(Core::Frontend::EmuWindow& emu_window_)
-    : emu_window{emu_window_} {
-    emu_window.MakeCurrent();
-}
-ScopeAcquireWindowContext::~ScopeAcquireWindowContext() {
-    emu_window.DoneCurrent();
-}
-
-} // namespace Core::Frontend
@@ -217,7 +217,7 @@ static u64 RegRead(std::size_t id, Kernel::Thread* thread = nullptr) {
        return 0;
    }

-    const auto& thread_context = thread->GetContext();
+    const auto& thread_context = thread->GetContext64();

    if (id < SP_REGISTER) {
        return thread_context.cpu_registers[id];
@@ -239,7 +239,7 @@ static void RegWrite(std::size_t id, u64 val, Kernel::Thread* thread = nullptr)
        return;
    }

-    auto& thread_context = thread->GetContext();
+    auto& thread_context = thread->GetContext64();

    if (id < SP_REGISTER) {
        thread_context.cpu_registers[id] = val;
@@ -259,7 +259,7 @@ static u128 FpuRead(std::size_t id, Kernel::Thread* thread = nullptr) {
        return u128{0};
    }

-    auto& thread_context = thread->GetContext();
+    auto& thread_context = thread->GetContext64();

    if (id >= UC_ARM64_REG_Q0 && id < FPCR_REGISTER) {
        return thread_context.vector_registers[id - UC_ARM64_REG_Q0];
@@ -275,7 +275,7 @@ static void FpuWrite(std::size_t id, u128 val, Kernel::Thread* thread = nullptr)
        return;
    }

-    auto& thread_context = thread->GetContext();
+    auto& thread_context = thread->GetContext64();

    if (id >= UC_ARM64_REG_Q0 && id < FPCR_REGISTER) {
        thread_context.vector_registers[id - UC_ARM64_REG_Q0] = val;
@@ -916,7 +916,7 @@ static void WriteRegister() {
    // Update ARM context, skipping scheduler - no running threads at this point
    Core::System::GetInstance()
        .ArmInterface(current_core)
-        .LoadContext(current_thread->GetContext());
+        .LoadContext(current_thread->GetContext64());

    SendReply("OK");
 }
@@ -947,7 +947,7 @@ static void WriteRegisters() {
    // Update ARM context, skipping scheduler - no running threads at this point
    Core::System::GetInstance()
        .ArmInterface(current_core)
-        .LoadContext(current_thread->GetContext());
+        .LoadContext(current_thread->GetContext64());

    SendReply("OK");
 }
@@ -1019,7 +1019,7 @@ static void Step() {
        // Update ARM context, skipping scheduler - no running threads at this point
        Core::System::GetInstance()
            .ArmInterface(current_core)
-            .LoadContext(current_thread->GetContext());
+            .LoadContext(current_thread->GetContext64());
    }
    step_loop = true;
    halt_loop = true;
@@ -20,6 +20,8 @@ constexpr u32 NUM_CPU_CORES = 4;            // Number of CPU Cores

 } // namespace Hardware

+constexpr u32 INVALID_HOST_THREAD_ID = 0xFFFFFFFF;
+
 struct EmuThreadHandle {
    u32 host_handle;
    u32 guest_handle;
@@ -3,9 +3,12 @@
 // Refer to the license.txt file included.

 #include <atomic>
+#include <bitset>
 #include <functional>
 #include <memory>
 #include <mutex>
+#include <thread>
+#include <unordered_map>
 #include <utility>

 #include "common/assert.h"
@@ -15,6 +18,7 @@
 #include "core/core.h"
 #include "core/core_timing.h"
 #include "core/core_timing_util.h"
+#include "core/hardware_properties.h"
 #include "core/hle/kernel/client_port.h"
 #include "core/hle/kernel/errors.h"
 #include "core/hle/kernel/handle_table.h"
@@ -25,6 +29,7 @@
 #include "core/hle/kernel/scheduler.h"
 #include "core/hle/kernel/synchronization.h"
 #include "core/hle/kernel/thread.h"
+#include "core/hle/kernel/time_manager.h"
 #include "core/hle/lock.h"
 #include "core/hle/result.h"
 #include "core/memory.h"
@@ -44,7 +49,7 @@ static void ThreadWakeupCallback(u64 thread_handle, [[maybe_unused]] s64 cycles_
    std::lock_guard lock{HLE::g_hle_lock};

    std::shared_ptr<Thread> thread =
-        system.Kernel().RetrieveThreadFromWakeupCallbackHandleTable(proper_handle);
+        system.Kernel().RetrieveThreadFromGlobalHandleTable(proper_handle);
    if (thread == nullptr) {
        LOG_CRITICAL(Kernel, "Callback fired for invalid thread {:08X}", proper_handle);
        return;
@@ -97,8 +102,8 @@ static void ThreadWakeupCallback(u64 thread_handle, [[maybe_unused]] s64 cycles_
 }

 struct KernelCore::Impl {
-    explicit Impl(Core::System& system)
-        : system{system}, global_scheduler{system}, synchronization{system} {}
+    explicit Impl(Core::System& system, KernelCore& kernel)
+        : system{system}, global_scheduler{kernel}, synchronization{system}, time_manager{system} {}

    void Initialize(KernelCore& kernel) {
        Shutdown();
@@ -120,7 +125,7 @@ struct KernelCore::Impl {

        system_resource_limit = nullptr;

-        thread_wakeup_callback_handle_table.Clear();
+        global_handle_table.Clear();
        thread_wakeup_event_type = nullptr;
        preemption_event = nullptr;

@@ -138,8 +143,8 @@ struct KernelCore::Impl {

    void InitializePhysicalCores() {
        exclusive_monitor =
-            Core::MakeExclusiveMonitor(system.Memory(), global_scheduler.CpuCoresCount());
-        for (std::size_t i = 0; i < global_scheduler.CpuCoresCount(); i++) {
+            Core::MakeExclusiveMonitor(system.Memory(), Core::Hardware::NUM_CPU_CORES);
+        for (std::size_t i = 0; i < Core::Hardware::NUM_CPU_CORES; i++) {
            cores.emplace_back(system, i, *exclusive_monitor);
        }
    }
@@ -181,9 +186,57 @@ struct KernelCore::Impl {
            return;
        }

+        for (auto& core : cores) {
+            core.SetIs64Bit(process->Is64BitProcess());
+        }
+
        system.Memory().SetCurrentPageTable(*process);
    }

+    void RegisterCoreThread(std::size_t core_id) {
+        // Registration mutates the shared ID tables, so it is serialized by the mutex.
+        std::unique_lock guard{register_thread_mutex};
+        const auto caller = std::this_thread::get_id();
+        ASSERT(core_id < Core::Hardware::NUM_CPU_CORES);
+        ASSERT(host_thread_ids.find(caller) == host_thread_ids.end());
+        ASSERT(!registered_core_threads[core_id]);
+        host_thread_ids[caller] = static_cast<u32>(core_id);
+        registered_core_threads.set(core_id);
+    }
+
+    void RegisterHostThread() {
+        std::unique_lock guard{register_thread_mutex};
+        const auto caller = std::this_thread::get_id();
+        ASSERT(host_thread_ids.find(caller) == host_thread_ids.end());
+        // The counter starts at NUM_CPU_CORES, so these IDs never collide with core IDs.
+        host_thread_ids[caller] = registered_thread_ids++;
+    }
+
+    u32 GetCurrentHostThreadID() const {
+        // NOTE(review): lookup runs without register_thread_mutex - confirm that is safe.
+        const auto entry = host_thread_ids.find(std::this_thread::get_id());
+        if (entry != host_thread_ids.end()) {
+            return entry->second;
+        }
+        return Core::INVALID_HOST_THREAD_ID;
+    }
+
+    Core::EmuThreadHandle GetCurrentEmuThreadID() const {
+        Core::EmuThreadHandle handle = Core::EmuThreadHandle::InvalidHandle();
+        handle.host_handle = GetCurrentHostThreadID();
+        // Only IDs below NUM_CPU_CORES index into `cores`; others keep the initial guest half.
+        if (handle.host_handle < Core::Hardware::NUM_CPU_CORES) {
+            const Kernel::Thread* const running =
+                cores[handle.host_handle].Scheduler().GetCurrentThread();
+            if (running == nullptr) {
+                handle.guest_handle = InvalidHandle;
+            } else {
+                handle.guest_handle = running->GetGlobalHandle();
+            }
+        }
+        return handle;
+    }
+
    std::atomic<u32> next_object_id{0};
    std::atomic<u64> next_kernel_process_id{Process::InitialKIPIDMin};
    std::atomic<u64> next_user_process_id{Process::ProcessIDMin};
@@ -194,15 +247,16 @@ struct KernelCore::Impl {
    Process* current_process = nullptr;
    Kernel::GlobalScheduler global_scheduler;
    Kernel::Synchronization synchronization;
+    Kernel::TimeManager time_manager;

    std::shared_ptr<ResourceLimit> system_resource_limit;

    std::shared_ptr<Core::Timing::EventType> thread_wakeup_event_type;
    std::shared_ptr<Core::Timing::EventType> preemption_event;

-    // TODO(yuriks): This can be removed if Thread objects are explicitly pooled in the future,
-    // allowing us to simply use a pool index or similar.
-    Kernel::HandleTable thread_wakeup_callback_handle_table;
+    // This is the kernel's handle table or supervisor handle table which
+    // stores all the objects in place.
+    Kernel::HandleTable global_handle_table;

    /// Map of named ports managed by the kernel, which can be retrieved using
    /// the ConnectToPort SVC.
@@ -211,11 +265,17 @@ struct KernelCore::Impl {
    std::unique_ptr<Core::ExclusiveMonitor> exclusive_monitor;
    std::vector<Kernel::PhysicalCore> cores;

+    // 0-3 IDs represent core threads, >3 represent others
+    std::unordered_map<std::thread::id, u32> host_thread_ids;
+    u32 registered_thread_ids{Core::Hardware::NUM_CPU_CORES};
+    std::bitset<Core::Hardware::NUM_CPU_CORES> registered_core_threads;
+    std::mutex register_thread_mutex;
+
    // System context
    Core::System& system;
 };

-KernelCore::KernelCore(Core::System& system) : impl{std::make_unique<Impl>(system)} {}
+KernelCore::KernelCore(Core::System& system) : impl{std::make_unique<Impl>(system, *this)} {}
 KernelCore::~KernelCore() {
    Shutdown();
 }
@@ -232,9 +292,8 @@ std::shared_ptr<ResourceLimit> KernelCore::GetSystemResourceLimit() const {
    return impl->system_resource_limit;
 }

-std::shared_ptr<Thread> KernelCore::RetrieveThreadFromWakeupCallbackHandleTable(
-    Handle handle) const {
-    return impl->thread_wakeup_callback_handle_table.Get<Thread>(handle);
+std::shared_ptr<Thread> KernelCore::RetrieveThreadFromGlobalHandleTable(Handle handle) const {
+    return impl->global_handle_table.Get<Thread>(handle);
 }

 void KernelCore::AppendNewProcess(std::shared_ptr<Process> process) {
@@ -265,6 +324,14 @@ const Kernel::GlobalScheduler& KernelCore::GlobalScheduler() const {
    return impl->global_scheduler;
 }

+Kernel::Scheduler& KernelCore::Scheduler(std::size_t id) {
+    return impl->cores[id].Scheduler(); // `id` indexes Impl::cores; no bounds check here
+}
+
+const Kernel::Scheduler& KernelCore::Scheduler(std::size_t id) const {
+    return impl->cores[id].Scheduler(); // const counterpart; same unchecked indexing
+}
+
 Kernel::PhysicalCore& KernelCore::PhysicalCore(std::size_t id) {
    return impl->cores[id];
 }
@@ -281,6 +348,14 @@ const Kernel::Synchronization& KernelCore::Synchronization() const {
    return impl->synchronization;
 }

+Kernel::TimeManager& KernelCore::TimeManager() {
+    return impl->time_manager; // the Impl member constructed from `system`
+}
+
+const Kernel::TimeManager& KernelCore::TimeManager() const {
+    return impl->time_manager; // read-only access to the same member
+}
+
 Core::ExclusiveMonitor& KernelCore::GetExclusiveMonitor() {
    return *impl->exclusive_monitor;
 }
@@ -338,12 +413,28 @@ const std::shared_ptr<Core::Timing::EventType>& KernelCore::ThreadWakeupCallback
    return impl->thread_wakeup_event_type;
 }

-Kernel::HandleTable& KernelCore::ThreadWakeupCallbackHandleTable() {
-    return impl->thread_wakeup_callback_handle_table;
+Kernel::HandleTable& KernelCore::GlobalHandleTable() {
+    return impl->global_handle_table;
 }

-const Kernel::HandleTable& KernelCore::ThreadWakeupCallbackHandleTable() const {
-    return impl->thread_wakeup_callback_handle_table;
+const Kernel::HandleTable& KernelCore::GlobalHandleTable() const {
+    return impl->global_handle_table;
+}
+
+void KernelCore::RegisterCoreThread(std::size_t core_id) {
+    impl->RegisterCoreThread(core_id); // Impl asserts the range and rejects re-registration
+}
+
+void KernelCore::RegisterHostThread() {
+    impl->RegisterHostThread(); // Impl hands out the next ID at or above NUM_CPU_CORES
+}
+
+u32 KernelCore::GetCurrentHostThreadID() const {
+    return impl->GetCurrentHostThreadID(); // INVALID_HOST_THREAD_ID when unregistered
+}
+
+Core::EmuThreadHandle KernelCore::GetCurrentEmuThreadID() const {
+    return impl->GetCurrentEmuThreadID(); // host ID plus, on core threads, the guest handle
 }

 } // namespace Kernel
@@ -11,6 +11,7 @@
 #include "core/hle/kernel/object.h"

 namespace Core {
+struct EmuThreadHandle;
 class ExclusiveMonitor;
 class System;
 } // namespace Core
@@ -29,8 +30,10 @@ class HandleTable;
 class PhysicalCore;
 class Process;
 class ResourceLimit;
+class Scheduler;
 class Synchronization;
 class Thread;
+class TimeManager;

 /// Represents a single instance of the kernel.
 class KernelCore {
@@ -64,7 +67,7 @@ public:
    std::shared_ptr<ResourceLimit> GetSystemResourceLimit() const;

-    /// Retrieves a shared pointer to a Thread instance within the thread wakeup handle table.
+    /// Retrieves a shared pointer to a Thread instance within the global handle table.
-    std::shared_ptr<Thread> RetrieveThreadFromWakeupCallbackHandleTable(Handle handle) const;
+    std::shared_ptr<Thread> RetrieveThreadFromGlobalHandleTable(Handle handle) const;

    /// Adds the given shared pointer to an internal list of active processes.
    void AppendNewProcess(std::shared_ptr<Process> process);
@@ -87,6 +90,12 @@ public:
    /// Gets the sole instance of the global scheduler
    const Kernel::GlobalScheduler& GlobalScheduler() const;

+    /// Gets the sole instance of the Scheduler associated with CPU core 'id'
+    Kernel::Scheduler& Scheduler(std::size_t id);
+
+    /// Gets the sole instance of the Scheduler associated with CPU core 'id'
+    const Kernel::Scheduler& Scheduler(std::size_t id) const;
+
    /// Gets the an instance of the respective physical CPU core.
    Kernel::PhysicalCore& PhysicalCore(std::size_t id);

@@ -99,6 +108,12 @@ public:
    /// Gets the an instance of the Synchronization Interface.
    const Kernel::Synchronization& Synchronization() const;

+    /// Gets an instance of the TimeManager interface.
+    Kernel::TimeManager& TimeManager();
+
+    /// Gets an instance of the TimeManager interface.
+    const Kernel::TimeManager& TimeManager() const;
+
    /// Stops execution of 'id' core, in order to reschedule a new thread.
    void PrepareReschedule(std::size_t id);

@@ -120,6 +135,18 @@ public:
    /// Determines whether or not the given port is a valid named port.
    bool IsValidNamedPort(NamedPortTable::const_iterator port) const;

+    /// Gets the current host_thread/guest_thread handle.
+    Core::EmuThreadHandle GetCurrentEmuThreadID() const;
+
+    /// Gets the current host_thread handle.
+    u32 GetCurrentHostThreadID() const;
+
+    /// Register the current thread as a CPU Core Thread.
+    void RegisterCoreThread(std::size_t core_id);
+
+    /// Register the current thread as a non CPU core thread.
+    void RegisterHostThread();
+
 private:
    friend class Object;
    friend class Process;
@@ -140,11 +167,11 @@ private:
    /// Retrieves the event type used for thread wakeup callbacks.
    const std::shared_ptr<Core::Timing::EventType>& ThreadWakeupCallbackEventType() const;

-    /// Provides a reference to the thread wakeup callback handle table.
-    Kernel::HandleTable& ThreadWakeupCallbackHandleTable();
+    /// Provides a reference to the global handle table.
+    Kernel::HandleTable& GlobalHandleTable();

-    /// Provides a const reference to the thread wakeup callback handle table.
-    const Kernel::HandleTable& ThreadWakeupCallbackHandleTable() const;
+    /// Provides a const reference to the global handle table.
+    const Kernel::HandleTable& GlobalHandleTable() const;

    struct Impl;
    std::unique_ptr<Impl> impl;
@@ -5,7 +5,8 @@
 #include "common/logging/log.h"
 #include "core/arm/arm_interface.h"
 #ifdef ARCHITECTURE_x86_64
-#include "core/arm/dynarmic/arm_dynarmic.h"
+#include "core/arm/dynarmic/arm_dynarmic_32.h"
+#include "core/arm/dynarmic/arm_dynarmic_64.h"
 #endif
 #include "core/arm/exclusive_monitor.h"
 #include "core/arm/unicorn/arm_unicorn.h"
@@ -20,13 +21,17 @@ PhysicalCore::PhysicalCore(Core::System& system, std::size_t id,
                           Core::ExclusiveMonitor& exclusive_monitor)
    : core_index{id} {
 #ifdef ARCHITECTURE_x86_64
-    arm_interface = std::make_unique<Core::ARM_Dynarmic>(system, exclusive_monitor, core_index);
+    arm_interface_32 =
+        std::make_unique<Core::ARM_Dynarmic_32>(system, exclusive_monitor, core_index);
+    arm_interface_64 =
+        std::make_unique<Core::ARM_Dynarmic_64>(system, exclusive_monitor, core_index);
+
 #else
    arm_interface = std::make_shared<Core::ARM_Unicorn>(system);
    LOG_WARNING(Core, "CPU JIT requested, but Dynarmic not available");
 #endif

-    scheduler = std::make_unique<Kernel::Scheduler>(system, *arm_interface, core_index);
+    scheduler = std::make_unique<Kernel::Scheduler>(system, core_index);
 }

 PhysicalCore::~PhysicalCore() = default;
@@ -48,4 +53,12 @@ void PhysicalCore::Shutdown() {
    scheduler->Shutdown();
 }

+void PhysicalCore::SetIs64Bit(bool is_64_bit) {
+    // Point the active interface at the backend matching the requested bitness;
+    // ownership stays with the two unique_ptr members.
+    // NOTE(review): the non-x86_64 constructor path fills neither, giving nullptr here.
+    const auto& selected = is_64_bit ? arm_interface_64 : arm_interface_32;
+    arm_interface = selected.get();
+}
+
 } // namespace Kernel
@@ -68,10 +68,14 @@ public:
        return *scheduler;
    }

+    void SetIs64Bit(bool is_64_bit);
+
 private:
    std::size_t core_index;
-    std::unique_ptr<Core::ARM_Interface> arm_interface;
+    std::unique_ptr<Core::ARM_Interface> arm_interface_32;
+    std::unique_ptr<Core::ARM_Interface> arm_interface_64;
    std::unique_ptr<Kernel::Scheduler> scheduler;
+    Core::ARM_Interface* arm_interface{};
 };

 } // namespace Kernel
@@ -42,7 +42,8 @@ void SetupMainThread(Process& owner_process, KernelCore& kernel, u32 priority) {

    // Register 1 must be a handle to the main thread
    const Handle thread_handle = owner_process.GetHandleTable().Create(thread).Unwrap();
-    thread->GetContext().cpu_registers[1] = thread_handle;
+    thread->GetContext32().cpu_registers[1] = thread_handle;
+    thread->GetContext64().cpu_registers[1] = thread_handle;

    // Threads by default are dormant, wake up the main thread so it runs when the scheduler fires
    thread->ResumeFromWait();
@@ -18,10 +18,11 @@
 #include "core/hle/kernel/kernel.h"
 #include "core/hle/kernel/process.h"
 #include "core/hle/kernel/scheduler.h"
+#include "core/hle/kernel/time_manager.h"

 namespace Kernel {

-GlobalScheduler::GlobalScheduler(Core::System& system) : system{system} {}
+GlobalScheduler::GlobalScheduler(KernelCore& kernel) : kernel{kernel} {}

 GlobalScheduler::~GlobalScheduler() = default;

@@ -35,7 +36,7 @@ void GlobalScheduler::RemoveThread(std::shared_ptr<Thread> thread) {
 }

 void GlobalScheduler::UnloadThread(std::size_t core) {
-    Scheduler& sched = system.Scheduler(core);
+    Scheduler& sched = kernel.Scheduler(core);
    sched.UnloadThread();
 }

@@ -50,7 +51,7 @@ void GlobalScheduler::SelectThread(std::size_t core) {
        sched.is_context_switch_pending = sched.selected_thread != sched.current_thread;
        std::atomic_thread_fence(std::memory_order_seq_cst);
    };
-    Scheduler& sched = system.Scheduler(core);
+    Scheduler& sched = kernel.Scheduler(core);
    Thread* current_thread = nullptr;
    // Step 1: Get top thread in schedule queue.
    current_thread = scheduled_queue[core].empty() ? nullptr : scheduled_queue[core].front();
@@ -356,8 +357,34 @@ void GlobalScheduler::Shutdown() {
    thread_list.clear();
 }

-Scheduler::Scheduler(Core::System& system, Core::ARM_Interface& cpu_core, std::size_t core_id)
-    : system(system), cpu_core(cpu_core), core_id(core_id) {}
+void GlobalScheduler::Lock() {
+    Core::EmuThreadHandle current_thread = kernel.GetCurrentEmuThreadID();
+    ASSERT(current_thread != Core::EmuThreadHandle::InvalidHandle()); // idle owner is invalid too
+    if (current_thread == current_owner) {
+        ++scope_lock; // re-entrant acquisition by the thread that already owns the lock
+    } else {
+        inner_lock.lock(); // blocks until the current owner's outermost Unlock()
+        current_owner = current_thread;
+        scope_lock = 1;
+    }
+}
+
+void GlobalScheduler::Unlock() {
+    if (--scope_lock != 0) { // an enclosing Lock() scope is still active
+        ASSERT(scope_lock > 0);
+        return;
+    }
+    for (std::size_t i = 0; i < Core::Hardware::NUM_CPU_CORES; i++) {
+        SelectThread(i); // reselect for every core while inner_lock is still held
+    }
+    current_owner = Core::EmuThreadHandle::InvalidHandle();
+    scope_lock = 1; // NOTE(review): left at 1, not 0, once released - confirm this is intended
+    inner_lock.unlock();
+    // TODO(Blinkhawk): Setup the interrupts and change context on current core.
+}
+
+Scheduler::Scheduler(Core::System& system, std::size_t core_id)
+    : system{system}, core_id{core_id} {}

 Scheduler::~Scheduler() = default;

@@ -395,9 +422,10 @@ void Scheduler::UnloadThread() {

    // Save context for previous thread
    if (previous_thread) {
-        cpu_core.SaveContext(previous_thread->GetContext());
+        system.ArmInterface(core_id).SaveContext(previous_thread->GetContext32());
+        system.ArmInterface(core_id).SaveContext(previous_thread->GetContext64());
        // Save the TPIDR_EL0 system register in case it was modified.
-        previous_thread->SetTPIDR_EL0(cpu_core.GetTPIDR_EL0());
+        previous_thread->SetTPIDR_EL0(system.ArmInterface(core_id).GetTPIDR_EL0());

        if (previous_thread->GetStatus() == ThreadStatus::Running) {
            // This is only the case when a reschedule is triggered without the current thread
@@ -424,9 +452,10 @@ void Scheduler::SwitchContext() {

    // Save context for previous thread
    if (previous_thread) {
-        cpu_core.SaveContext(previous_thread->GetContext());
+        system.ArmInterface(core_id).SaveContext(previous_thread->GetContext32());
+        system.ArmInterface(core_id).SaveContext(previous_thread->GetContext64());
        // Save the TPIDR_EL0 system register in case it was modified.
-        previous_thread->SetTPIDR_EL0(cpu_core.GetTPIDR_EL0());
+        previous_thread->SetTPIDR_EL0(system.ArmInterface(core_id).GetTPIDR_EL0());

        if (previous_thread->GetStatus() == ThreadStatus::Running) {
            // This is only the case when a reschedule is triggered without the current thread
@@ -454,9 +483,10 @@ void Scheduler::SwitchContext() {
            system.Kernel().MakeCurrentProcess(thread_owner_process);
        }

-        cpu_core.LoadContext(new_thread->GetContext());
-        cpu_core.SetTlsAddress(new_thread->GetTLSAddress());
-        cpu_core.SetTPIDR_EL0(new_thread->GetTPIDR_EL0());
+        system.ArmInterface(core_id).LoadContext(new_thread->GetContext32());
+        system.ArmInterface(core_id).LoadContext(new_thread->GetContext64());
+        system.ArmInterface(core_id).SetTlsAddress(new_thread->GetTLSAddress());
+        system.ArmInterface(core_id).SetTPIDR_EL0(new_thread->GetTPIDR_EL0());
    } else {
        current_thread = nullptr;
        // Note: We do not reset the current process and current page table when idling because
@@ -485,4 +515,27 @@ void Scheduler::Shutdown() {
    selected_thread = nullptr;
 }

+SchedulerLock::SchedulerLock(KernelCore& kernel) : kernel{kernel} {
+    kernel.GlobalScheduler().Lock(); // held for the lifetime of this object
+}
+
+SchedulerLock::~SchedulerLock() {
+    kernel.GlobalScheduler().Unlock(); // pairs with the Lock() issued by the constructor
+}
+
+SchedulerLockAndSleep::SchedulerLockAndSleep(KernelCore& kernel, Handle& event_handle,
+                                             Thread* time_task, s64 nanoseconds)
+    : SchedulerLock{kernel}, event_handle{event_handle}, time_task{time_task},
+      nanoseconds{nanoseconds} {
+    event_handle = InvalidHandle; // the caller's handle starts out invalid
+}
+
+SchedulerLockAndSleep::~SchedulerLockAndSleep() {
+    // This body runs before the SchedulerLock base destructor, i.e. before Unlock().
+    const bool should_sleep = !sleep_cancelled;
+    if (should_sleep) {
+        kernel.TimeManager().ScheduleTimeEvent(event_handle, time_task, nanoseconds);
+    }
+}
+
 } // namespace Kernel
@@ -6,6 +6,7 @@

 #include <atomic>
 #include <memory>
+#include <mutex>
 #include <vector>

 #include "common/common_types.h"
@@ -20,11 +21,13 @@ class System;

 namespace Kernel {

+class KernelCore;
 class Process;
+class SchedulerLock;

 class GlobalScheduler final {
 public:
-    explicit GlobalScheduler(Core::System& system);
+    explicit GlobalScheduler(KernelCore& kernel);
    ~GlobalScheduler();

    /// Adds a new thread to the scheduler
@@ -138,6 +141,14 @@ public:
    void Shutdown();

 private:
+    friend class SchedulerLock;
+
+    /// Lock the scheduler to the current thread.
+    void Lock();
+
+    /// Unlocks the scheduler, reselects threads, interrupts cores for rescheduling
+    /// and reschedules current core if needed.
+    void Unlock();
    /**
     * Transfers a thread into an specific core. If the destination_core is -1
     * it will be unscheduled from its source code and added into its suggested
@@ -158,14 +169,19 @@ private:
    // ordered from Core 0 to Core 3.
    std::array<u32, Core::Hardware::NUM_CPU_CORES> preemption_priorities = {59, 59, 59, 62};

+    /// Scheduler lock mechanisms.
+    std::mutex inner_lock{}; // TODO(Blinkhawk): Replace for a SpinLock
+    std::atomic<s64> scope_lock{};
+    Core::EmuThreadHandle current_owner{Core::EmuThreadHandle::InvalidHandle()};
+
    /// Lists all thread ids that aren't deleted/etc.
    std::vector<std::shared_ptr<Thread>> thread_list;
-    Core::System& system;
+    KernelCore& kernel;
 };

 class Scheduler final {
 public:
-    explicit Scheduler(Core::System& system, Core::ARM_Interface& cpu_core, std::size_t core_id);
+    explicit Scheduler(Core::System& system, std::size_t core_id);
    ~Scheduler();

    /// Returns whether there are any threads that are ready to run.
@@ -219,7 +235,6 @@ private:
    std::shared_ptr<Thread> selected_thread = nullptr;

    Core::System& system;
-    Core::ARM_Interface& cpu_core;
    u64 last_context_switch_time = 0;
    u64 idle_selection_count = 0;
    const std::size_t core_id;
@@ -227,4 +242,30 @@ private:
    bool is_context_switch_pending = false;
 };

+class SchedulerLock {
+public:
+    explicit SchedulerLock(KernelCore& kernel); // locks the global scheduler
+    ~SchedulerLock();                           // unlocks the global scheduler
+    SchedulerLock(const SchedulerLock&) = delete; // a copy would call Unlock() a second time
+protected:
+    KernelCore& kernel;
+};
+
+class SchedulerLockAndSleep : public SchedulerLock {
+public:
+    explicit SchedulerLockAndSleep(KernelCore& kernel, Handle& event_handle, Thread* time_task,
+                                   s64 nanoseconds);
+    ~SchedulerLockAndSleep(); // schedules the time event unless CancelSleep() was called
+
+    void CancelSleep() { // makes the destructor skip ScheduleTimeEvent
+        sleep_cancelled = true;
+    }
+
+private:
+    Handle& event_handle; // caller-owned; set to InvalidHandle by the constructor
+    Thread* time_task;    // passed to TimeManager::ScheduleTimeEvent
+    s64 nanoseconds;      // passed to TimeManager::ScheduleTimeEvent
+    bool sleep_cancelled{};
+};
+
 } // namespace Kernel
@@ -187,6 +187,13 @@ static ResultCode SetHeapSize(Core::System& system, VAddr* heap_addr, u64 heap_s
    return RESULT_SUCCESS;
 }

+static ResultCode SetHeapSize32(Core::System& system, u32* heap_addr, u32 heap_size) {
+    VAddr wide_heap_addr{}; // receives the address from the shared implementation
+    const ResultCode rc{SetHeapSize(system, &wide_heap_addr, heap_size)};
+    *heap_addr = static_cast<u32>(wide_heap_addr); // narrowed for the 32-bit caller
+    return rc;
+}
+
 static ResultCode SetMemoryPermission(Core::System& system, VAddr addr, u64 size, u32 prot) {
    LOG_TRACE(Kernel_SVC, "called, addr=0x{:X}, size=0x{:X}, prot=0x{:X}", addr, size, prot);

@@ -371,6 +378,12 @@ static ResultCode ConnectToNamedPort(Core::System& system, Handle* out_handle,
    return RESULT_SUCCESS;
 }

+static ResultCode ConnectToNamedPort32(Core::System& system, Handle* out_handle,
+                                       u32 port_name_address) {
+    // 32-bit entry point: forwards its arguments to ConnectToNamedPort unchanged.
+    return ConnectToNamedPort(system, out_handle, port_name_address);
+}
+
 /// Makes a blocking IPC call to an OS service.
 static ResultCode SendSyncRequest(Core::System& system, Handle handle) {
    const auto& handle_table = system.Kernel().CurrentProcess()->GetHandleTable();
@@ -390,6 +403,10 @@ static ResultCode SendSyncRequest(Core::System& system, Handle handle) {
    return session->SendSyncRequest(SharedFrom(thread), system.Memory());
 }

+static ResultCode SendSyncRequest32(Core::System& system, Handle handle) {
+    return SendSyncRequest(system, handle); // 32-bit entry point; plain forward
+}
+
 /// Get the ID for the specified thread.
 static ResultCode GetThreadId(Core::System& system, u64* thread_id, Handle thread_handle) {
    LOG_TRACE(Kernel_SVC, "called thread=0x{:08X}", thread_handle);
@@ -405,6 +422,17 @@ static ResultCode GetThreadId(Core::System& system, u64* thread_id, Handle threa
    return RESULT_SUCCESS;
 }

+static ResultCode GetThreadId32(Core::System& system, u32* thread_id_low, u32* thread_id_high,
+                                Handle thread_handle) {
+    u64 thread_id{};
+    const ResultCode result{GetThreadId(system, &thread_id, thread_handle)};
+    // Split the 64-bit ID: low 32 bits to *thread_id_low, high 32 bits to *thread_id_high.
+    *thread_id_low = static_cast<u32>(thread_id & std::numeric_limits<u32>::max());
+    *thread_id_high = static_cast<u32>(thread_id >> 32);
+
+    return result;
+}
+
 /// Gets the ID of the specified process or a specified thread's owning process.
 static ResultCode GetProcessId(Core::System& system, u64* process_id, Handle handle) {
    LOG_DEBUG(Kernel_SVC, "called handle=0x{:08X}", handle);
@@ -479,6 +507,12 @@ static ResultCode WaitSynchronization(Core::System& system, Handle* index, VAddr
    return result;
 }

+static ResultCode WaitSynchronization32(Core::System& system, u32 timeout_low, u32 handles_address,
+                                        s32 handle_count, u32 timeout_high, Handle* index) {
+    const s64 timeout_ns{static_cast<s64>(timeout_low) | (static_cast<s64>(timeout_high) << 32)};
+    return WaitSynchronization(system, index, handles_address, handle_count, timeout_ns);
+}
+
 /// Resumes a thread waiting on WaitSynchronization
 static ResultCode CancelSynchronization(Core::System& system, Handle thread_handle) {
    LOG_TRACE(Kernel_SVC, "called thread=0x{:X}", thread_handle);
@@ -917,6 +951,18 @@ static ResultCode GetInfo(Core::System& system, u64* result, u64 info_id, u64 ha
    }
 }

+static ResultCode GetInfo32(Core::System& system, u32* result_low, u32* result_high, u32 sub_id_low,
+                            u32 info_id, u32 handle, u32 sub_id_high) {
+    // Join the two halves into the 64-bit sub ID passed to GetInfo.
+    const u64 sub_id{(static_cast<u64>(sub_id_high) << 32) | sub_id_low};
+    u64 info_value{};
+    const ResultCode rc{GetInfo(system, &info_value, info_id, handle, sub_id)};
+    // Split the 64-bit value back into halves; both outputs are written regardless of rc.
+    *result_low = static_cast<u32>(info_value);
+    *result_high = static_cast<u32>(info_value >> 32);
+    return rc;
+}
+
 /// Maps memory at a desired address
 static ResultCode MapPhysicalMemory(Core::System& system, VAddr addr, u64 size) {
    LOG_DEBUG(Kernel_SVC, "called, addr=0x{:016X}, size=0x{:X}", addr, size);
@@ -1058,7 +1104,7 @@ static ResultCode GetThreadContext(Core::System& system, VAddr thread_context, H
        return ERR_BUSY;
    }

-    Core::ARM_Interface::ThreadContext ctx = thread->GetContext();
+    Core::ARM_Interface::ThreadContext64 ctx = thread->GetContext64();
    // Mask away mode bits, interrupt bits, IL bit, and other reserved bits.
    ctx.pstate &= 0xFF0FFE20;

@@ -1088,6 +1134,10 @@ static ResultCode GetThreadPriority(Core::System& system, u32* priority, Handle
    return RESULT_SUCCESS;
 }

+static ResultCode GetThreadPriority32(Core::System& system, u32* priority, Handle handle) {
+    return GetThreadPriority(system, priority, handle); // 32-bit entry point; plain forward
+}
+
 /// Sets the priority for the specified thread
 static ResultCode SetThreadPriority(Core::System& system, Handle handle, u32 priority) {
    LOG_TRACE(Kernel_SVC, "called");
@@ -1259,6 +1309,11 @@ static ResultCode QueryMemory(Core::System& system, VAddr memory_info_address,
                              query_address);
 }

+static ResultCode QueryMemory32(Core::System& system, u32 memory_info_address,
+                                u32 page_info_address, u32 query_address) { // 32-bit entry
+    return QueryMemory(system, memory_info_address, page_info_address, query_address);
+}
+
 static ResultCode MapProcessCodeMemory(Core::System& system, Handle process_handle, u64 dst_address,
                                       u64 src_address, u64 size) {
    LOG_DEBUG(Kernel_SVC,
@@ -1675,6 +1730,10 @@ static void SignalProcessWideKey(Core::System& system, VAddr condition_variable_
    }
 }

+static void SignalProcessWideKey32(Core::System& system, u32 condition_variable_addr, s32 target) {
+    SignalProcessWideKey(system, condition_variable_addr, target); // plain forward
+}
+
 // Wait for an address (via Address Arbiter)
 static ResultCode WaitForAddress(Core::System& system, VAddr address, u32 type, s32 value,
                                 s64 timeout) {
@@ -1760,6 +1819,10 @@ static ResultCode CloseHandle(Core::System& system, Handle handle) {
    return handle_table.Close(handle);
 }

+static ResultCode CloseHandle32(Core::System& system, Handle handle) {
+    return CloseHandle(system, handle); // 32-bit entry point; plain forward
+}
+
 /// Clears the signaled state of an event or process.
 static ResultCode ResetSignal(Core::System& system, Handle handle) {
    LOG_DEBUG(Kernel_SVC, "called handle 0x{:08X}", handle);
@@ -2317,69 +2380,196 @@ struct FunctionDef {
 };
 } // namespace

-static const FunctionDef SVC_Table[] = {
+static const FunctionDef SVC_Table_32[] = {
    {0x00, nullptr, "Unknown"},
-    {0x01, SvcWrap<SetHeapSize>, "SetHeapSize"},
-    {0x02, SvcWrap<SetMemoryPermission>, "SetMemoryPermission"},
-    {0x03, SvcWrap<SetMemoryAttribute>, "SetMemoryAttribute"},
-    {0x04, SvcWrap<MapMemory>, "MapMemory"},
-    {0x05, SvcWrap<UnmapMemory>, "UnmapMemory"},
-    {0x06, SvcWrap<QueryMemory>, "QueryMemory"},
-    {0x07, SvcWrap<ExitProcess>, "ExitProcess"},
-    {0x08, SvcWrap<CreateThread>, "CreateThread"},
-    {0x09, SvcWrap<StartThread>, "StartThread"},
-    {0x0A, SvcWrap<ExitThread>, "ExitThread"},
-    {0x0B, SvcWrap<SleepThread>, "SleepThread"},
-    {0x0C, SvcWrap<GetThreadPriority>, "GetThreadPriority"},
-    {0x0D, SvcWrap<SetThreadPriority>, "SetThreadPriority"},
-    {0x0E, SvcWrap<GetThreadCoreMask>, "GetThreadCoreMask"},
-    {0x0F, SvcWrap<SetThreadCoreMask>, "SetThreadCoreMask"},
-    {0x10, SvcWrap<GetCurrentProcessorNumber>, "GetCurrentProcessorNumber"},
-    {0x11, SvcWrap<SignalEvent>, "SignalEvent"},
-    {0x12, SvcWrap<ClearEvent>, "ClearEvent"},
-    {0x13, SvcWrap<MapSharedMemory>, "MapSharedMemory"},
-    {0x14, SvcWrap<UnmapSharedMemory>, "UnmapSharedMemory"},
-    {0x15, SvcWrap<CreateTransferMemory>, "CreateTransferMemory"},
-    {0x16, SvcWrap<CloseHandle>, "CloseHandle"},
-    {0x17, SvcWrap<ResetSignal>, "ResetSignal"},
-    {0x18, SvcWrap<WaitSynchronization>, "WaitSynchronization"},
-    {0x19, SvcWrap<CancelSynchronization>, "CancelSynchronization"},
-    {0x1A, SvcWrap<ArbitrateLock>, "ArbitrateLock"},
-    {0x1B, SvcWrap<ArbitrateUnlock>, "ArbitrateUnlock"},
-    {0x1C, SvcWrap<WaitProcessWideKeyAtomic>, "WaitProcessWideKeyAtomic"},
-    {0x1D, SvcWrap<SignalProcessWideKey>, "SignalProcessWideKey"},
-    {0x1E, SvcWrap<GetSystemTick>, "GetSystemTick"},
-    {0x1F, SvcWrap<ConnectToNamedPort>, "ConnectToNamedPort"},
+    {0x01, SvcWrap32<SetHeapSize32>, "SetHeapSize32"},
+    {0x02, nullptr, "Unknown"},
+    {0x03, nullptr, "SetMemoryAttribute32"},
+    {0x04, nullptr, "MapMemory32"},
+    {0x05, nullptr, "UnmapMemory32"},
+    {0x06, SvcWrap32<QueryMemory32>, "QueryMemory32"},
+    {0x07, nullptr, "ExitProcess32"},
+    {0x08, nullptr, "CreateThread32"},
+    {0x09, nullptr, "StartThread32"},
+    {0x0a, nullptr, "ExitThread32"},
+    {0x0b, nullptr, "SleepThread32"},
+    {0x0c, SvcWrap32<GetThreadPriority32>, "GetThreadPriority32"},
+    {0x0d, nullptr, "SetThreadPriority32"},
+    {0x0e, nullptr, "GetThreadCoreMask32"},
+    {0x0f, nullptr, "SetThreadCoreMask32"},
+    {0x10, nullptr, "GetCurrentProcessorNumber32"},
+    {0x11, nullptr, "SignalEvent32"},
+    {0x12, nullptr, "ClearEvent32"},
+    {0x13, nullptr, "MapSharedMemory32"},
+    {0x14, nullptr, "UnmapSharedMemory32"},
+    {0x15, nullptr, "CreateTransferMemory32"},
+    {0x16, SvcWrap32<CloseHandle32>, "CloseHandle32"},
+    {0x17, nullptr, "ResetSignal32"},
+    {0x18, SvcWrap32<WaitSynchronization32>, "WaitSynchronization32"},
+    {0x19, nullptr, "CancelSynchronization32"},
+    {0x1a, nullptr, "ArbitrateLock32"},
+    {0x1b, nullptr, "ArbitrateUnlock32"},
+    {0x1c, nullptr, "WaitProcessWideKeyAtomic32"},
+    {0x1d, SvcWrap32<SignalProcessWideKey32>, "SignalProcessWideKey32"},
+    {0x1e, nullptr, "GetSystemTick32"},
+    {0x1f, SvcWrap32<ConnectToNamedPort32>, "ConnectToNamedPort32"},
+    {0x20, nullptr, "Unknown"},
+    {0x21, SvcWrap32<SendSyncRequest32>, "SendSyncRequest32"},
+    {0x22, nullptr, "SendSyncRequestWithUserBuffer32"},
+    {0x23, nullptr, "Unknown"},
+    {0x24, nullptr, "GetProcessId32"},
+    {0x25, SvcWrap32<GetThreadId32>, "GetThreadId32"},
+    {0x26, nullptr, "Break32"},
+    {0x27, nullptr, "OutputDebugString32"},
+    {0x28, nullptr, "Unknown"},
+    {0x29, SvcWrap32<GetInfo32>, "GetInfo32"},
+    {0x2a, nullptr, "Unknown"},
+    {0x2b, nullptr, "Unknown"},
+    {0x2c, nullptr, "MapPhysicalMemory32"},
+    {0x2d, nullptr, "UnmapPhysicalMemory32"},
+    {0x2e, nullptr, "Unknown"},
+    {0x2f, nullptr, "Unknown"},
+    {0x30, nullptr, "Unknown"},
+    {0x31, nullptr, "Unknown"},
+    {0x32, nullptr, "SetThreadActivity32"},
+    {0x33, nullptr, "GetThreadContext32"},
+    {0x34, nullptr, "WaitForAddress32"},
+    {0x35, nullptr, "SignalToAddress32"},
+    {0x36, nullptr, "Unknown"},
+    {0x37, nullptr, "Unknown"},
+    {0x38, nullptr, "Unknown"},
+    {0x39, nullptr, "Unknown"},
+    {0x3a, nullptr, "Unknown"},
+    {0x3b, nullptr, "Unknown"},
+    {0x3c, nullptr, "Unknown"},
+    {0x3d, nullptr, "Unknown"},
+    {0x3e, nullptr, "Unknown"},
+    {0x3f, nullptr, "Unknown"},
+    {0x40, nullptr, "CreateSession32"},
+    {0x41, nullptr, "AcceptSession32"},
+    {0x42, nullptr, "Unknown"},
+    {0x43, nullptr, "ReplyAndReceive32"},
+    {0x44, nullptr, "Unknown"},
+    {0x45, nullptr, "CreateEvent32"},
+    {0x46, nullptr, "Unknown"},
+    {0x47, nullptr, "Unknown"},
+    {0x48, nullptr, "Unknown"},
+    {0x49, nullptr, "Unknown"},
+    {0x4a, nullptr, "Unknown"},
+    {0x4b, nullptr, "Unknown"},
+    {0x4c, nullptr, "Unknown"},
+    {0x4d, nullptr, "Unknown"},
+    {0x4e, nullptr, "Unknown"},
+    {0x4f, nullptr, "Unknown"},
+    {0x50, nullptr, "Unknown"},
+    {0x51, nullptr, "Unknown"},
+    {0x52, nullptr, "Unknown"},
+    {0x53, nullptr, "Unknown"},
+    {0x54, nullptr, "Unknown"},
+    {0x55, nullptr, "Unknown"},
+    {0x56, nullptr, "Unknown"},
+    {0x57, nullptr, "Unknown"},
+    {0x58, nullptr, "Unknown"},
+    {0x59, nullptr, "Unknown"},
+    {0x5a, nullptr, "Unknown"},
+    {0x5b, nullptr, "Unknown"},
+    {0x5c, nullptr, "Unknown"},
+    {0x5d, nullptr, "Unknown"},
+    {0x5e, nullptr, "Unknown"},
+    {0x5F, nullptr, "FlushProcessDataCache32"},
+    {0x60, nullptr, "Unknown"},
+    {0x61, nullptr, "Unknown"},
+    {0x62, nullptr, "Unknown"},
+    {0x63, nullptr, "Unknown"},
+    {0x64, nullptr, "Unknown"},
+    {0x65, nullptr, "GetProcessList32"},
+    {0x66, nullptr, "Unknown"},
+    {0x67, nullptr, "Unknown"},
+    {0x68, nullptr, "Unknown"},
+    {0x69, nullptr, "Unknown"},
+    {0x6A, nullptr, "Unknown"},
+    {0x6B, nullptr, "Unknown"},
+    {0x6C, nullptr, "Unknown"},
+    {0x6D, nullptr, "Unknown"},
+    {0x6E, nullptr, "Unknown"},
+    {0x6f, nullptr, "GetSystemInfo32"},
+    {0x70, nullptr, "CreatePort32"},
+    {0x71, nullptr, "ManageNamedPort32"},
+    {0x72, nullptr, "ConnectToPort32"},
+    {0x73, nullptr, "SetProcessMemoryPermission32"},
+    {0x74, nullptr, "Unknown"},
+    {0x75, nullptr, "Unknown"},
+    {0x76, nullptr, "Unknown"},
+    {0x77, nullptr, "MapProcessCodeMemory32"},
+    {0x78, nullptr, "UnmapProcessCodeMemory32"},
+    {0x79, nullptr, "Unknown"},
+    {0x7A, nullptr, "Unknown"},
+    {0x7B, nullptr, "TerminateProcess32"},
+};
+
+static const FunctionDef SVC_Table_64[] = {
+    {0x00, nullptr, "Unknown"},
+    {0x01, SvcWrap64<SetHeapSize>, "SetHeapSize"},
+    {0x02, SvcWrap64<SetMemoryPermission>, "SetMemoryPermission"},
+    {0x03, SvcWrap64<SetMemoryAttribute>, "SetMemoryAttribute"},
+    {0x04, SvcWrap64<MapMemory>, "MapMemory"},
+    {0x05, SvcWrap64<UnmapMemory>, "UnmapMemory"},
+    {0x06, SvcWrap64<QueryMemory>, "QueryMemory"},
+    {0x07, SvcWrap64<ExitProcess>, "ExitProcess"},
+    {0x08, SvcWrap64<CreateThread>, "CreateThread"},
+    {0x09, SvcWrap64<StartThread>, "StartThread"},
+    {0x0A, SvcWrap64<ExitThread>, "ExitThread"},
+    {0x0B, SvcWrap64<SleepThread>, "SleepThread"},
+    {0x0C, SvcWrap64<GetThreadPriority>, "GetThreadPriority"},
+    {0x0D, SvcWrap64<SetThreadPriority>, "SetThreadPriority"},
+    {0x0E, SvcWrap64<GetThreadCoreMask>, "GetThreadCoreMask"},
+    {0x0F, SvcWrap64<SetThreadCoreMask>, "SetThreadCoreMask"},
+    {0x10, SvcWrap64<GetCurrentProcessorNumber>, "GetCurrentProcessorNumber"},
+    {0x11, SvcWrap64<SignalEvent>, "SignalEvent"},
+    {0x12, SvcWrap64<ClearEvent>, "ClearEvent"},
+    {0x13, SvcWrap64<MapSharedMemory>, "MapSharedMemory"},
+    {0x14, SvcWrap64<UnmapSharedMemory>, "UnmapSharedMemory"},
+    {0x15, SvcWrap64<CreateTransferMemory>, "CreateTransferMemory"},
+    {0x16, SvcWrap64<CloseHandle>, "CloseHandle"},
+    {0x17, SvcWrap64<ResetSignal>, "ResetSignal"},
+    {0x18, SvcWrap64<WaitSynchronization>, "WaitSynchronization"},
+    {0x19, SvcWrap64<CancelSynchronization>, "CancelSynchronization"},
+    {0x1A, SvcWrap64<ArbitrateLock>, "ArbitrateLock"},
+    {0x1B, SvcWrap64<ArbitrateUnlock>, "ArbitrateUnlock"},
+    {0x1C, SvcWrap64<WaitProcessWideKeyAtomic>, "WaitProcessWideKeyAtomic"},
+    {0x1D, SvcWrap64<SignalProcessWideKey>, "SignalProcessWideKey"},
+    {0x1E, SvcWrap64<GetSystemTick>, "GetSystemTick"},
+    {0x1F, SvcWrap64<ConnectToNamedPort>, "ConnectToNamedPort"},
    {0x20, nullptr, "SendSyncRequestLight"},
-    {0x21, SvcWrap<SendSyncRequest>, "SendSyncRequest"},
+    {0x21, SvcWrap64<SendSyncRequest>, "SendSyncRequest"},
    {0x22, nullptr, "SendSyncRequestWithUserBuffer"},
    {0x23, nullptr, "SendAsyncRequestWithUserBuffer"},
-    {0x24, SvcWrap<GetProcessId>, "GetProcessId"},
-    {0x25, SvcWrap<GetThreadId>, "GetThreadId"},
-    {0x26, SvcWrap<Break>, "Break"},
-    {0x27, SvcWrap<OutputDebugString>, "OutputDebugString"},
+    {0x24, SvcWrap64<GetProcessId>, "GetProcessId"},
+    {0x25, SvcWrap64<GetThreadId>, "GetThreadId"},
+    {0x26, SvcWrap64<Break>, "Break"},
+    {0x27, SvcWrap64<OutputDebugString>, "OutputDebugString"},
    {0x28, nullptr, "ReturnFromException"},
-    {0x29, SvcWrap<GetInfo>, "GetInfo"},
+    {0x29, SvcWrap64<GetInfo>, "GetInfo"},
    {0x2A, nullptr, "FlushEntireDataCache"},
    {0x2B, nullptr, "FlushDataCache"},
-    {0x2C, SvcWrap<MapPhysicalMemory>, "MapPhysicalMemory"},
-    {0x2D, SvcWrap<UnmapPhysicalMemory>, "UnmapPhysicalMemory"},
+    {0x2C, SvcWrap64<MapPhysicalMemory>, "MapPhysicalMemory"},
+    {0x2D, SvcWrap64<UnmapPhysicalMemory>, "UnmapPhysicalMemory"},
    {0x2E, nullptr, "GetFutureThreadInfo"},
    {0x2F, nullptr, "GetLastThreadInfo"},
-    {0x30, SvcWrap<GetResourceLimitLimitValue>, "GetResourceLimitLimitValue"},
-    {0x31, SvcWrap<GetResourceLimitCurrentValue>, "GetResourceLimitCurrentValue"},
-    {0x32, SvcWrap<SetThreadActivity>, "SetThreadActivity"},
-    {0x33, SvcWrap<GetThreadContext>, "GetThreadContext"},
-    {0x34, SvcWrap<WaitForAddress>, "WaitForAddress"},
-    {0x35, SvcWrap<SignalToAddress>, "SignalToAddress"},
+    {0x30, SvcWrap64<GetResourceLimitLimitValue>, "GetResourceLimitLimitValue"},
+    {0x31, SvcWrap64<GetResourceLimitCurrentValue>, "GetResourceLimitCurrentValue"},
+    {0x32, SvcWrap64<SetThreadActivity>, "SetThreadActivity"},
+    {0x33, SvcWrap64<GetThreadContext>, "GetThreadContext"},
+    {0x34, SvcWrap64<WaitForAddress>, "WaitForAddress"},
+    {0x35, SvcWrap64<SignalToAddress>, "SignalToAddress"},
    {0x36, nullptr, "SynchronizePreemptionState"},
    {0x37, nullptr, "Unknown"},
    {0x38, nullptr, "Unknown"},
    {0x39, nullptr, "Unknown"},
    {0x3A, nullptr, "Unknown"},
    {0x3B, nullptr, "Unknown"},
-    {0x3C, SvcWrap<KernelDebug>, "KernelDebug"},
-    {0x3D, SvcWrap<ChangeKernelTraceState>, "ChangeKernelTraceState"},
+    {0x3C, SvcWrap64<KernelDebug>, "KernelDebug"},
+    {0x3D, SvcWrap64<ChangeKernelTraceState>, "ChangeKernelTraceState"},
    {0x3E, nullptr, "Unknown"},
    {0x3F, nullptr, "Unknown"},
    {0x40, nullptr, "CreateSession"},
@@ -2387,7 +2577,7 @@ static const FunctionDef SVC_Table[] = {
    {0x42, nullptr, "ReplyAndReceiveLight"},
    {0x43, nullptr, "ReplyAndReceive"},
    {0x44, nullptr, "ReplyAndReceiveWithUserBuffer"},
-    {0x45, SvcWrap<CreateEvent>, "CreateEvent"},
+    {0x45, SvcWrap64<CreateEvent>, "CreateEvent"},
    {0x46, nullptr, "Unknown"},
    {0x47, nullptr, "Unknown"},
    {0x48, nullptr, "MapPhysicalMemoryUnsafe"},
@@ -2398,9 +2588,9 @@ static const FunctionDef SVC_Table[] = {
    {0x4D, nullptr, "SleepSystem"},
    {0x4E, nullptr, "ReadWriteRegister"},
    {0x4F, nullptr, "SetProcessActivity"},
-    {0x50, SvcWrap<CreateSharedMemory>, "CreateSharedMemory"},
-    {0x51, SvcWrap<MapTransferMemory>, "MapTransferMemory"},
-    {0x52, SvcWrap<UnmapTransferMemory>, "UnmapTransferMemory"},
+    {0x50, SvcWrap64<CreateSharedMemory>, "CreateSharedMemory"},
+    {0x51, SvcWrap64<MapTransferMemory>, "MapTransferMemory"},
+    {0x52, SvcWrap64<UnmapTransferMemory>, "UnmapTransferMemory"},
    {0x53, nullptr, "CreateInterruptEvent"},
    {0x54, nullptr, "QueryPhysicalAddress"},
    {0x55, nullptr, "QueryIoMapping"},
@@ -2419,8 +2609,8 @@ static const FunctionDef SVC_Table[] = {
    {0x62, nullptr, "TerminateDebugProcess"},
    {0x63, nullptr, "GetDebugEvent"},
    {0x64, nullptr, "ContinueDebugEvent"},
-    {0x65, SvcWrap<GetProcessList>, "GetProcessList"},
-    {0x66, SvcWrap<GetThreadList>, "GetThreadList"},
+    {0x65, SvcWrap64<GetProcessList>, "GetProcessList"},
+    {0x66, SvcWrap64<GetThreadList>, "GetThreadList"},
    {0x67, nullptr, "GetDebugThreadContext"},
    {0x68, nullptr, "SetDebugThreadContext"},
    {0x69, nullptr, "QueryDebugProcessMemory"},
@@ -2436,24 +2626,32 @@ static const FunctionDef SVC_Table[] = {
    {0x73, nullptr, "SetProcessMemoryPermission"},
    {0x74, nullptr, "MapProcessMemory"},
    {0x75, nullptr, "UnmapProcessMemory"},
-    {0x76, SvcWrap<QueryProcessMemory>, "QueryProcessMemory"},
-    {0x77, SvcWrap<MapProcessCodeMemory>, "MapProcessCodeMemory"},
-    {0x78, SvcWrap<UnmapProcessCodeMemory>, "UnmapProcessCodeMemory"},
+    {0x76, SvcWrap64<QueryProcessMemory>, "QueryProcessMemory"},
+    {0x77, SvcWrap64<MapProcessCodeMemory>, "MapProcessCodeMemory"},
+    {0x78, SvcWrap64<UnmapProcessCodeMemory>, "UnmapProcessCodeMemory"},
    {0x79, nullptr, "CreateProcess"},
    {0x7A, nullptr, "StartProcess"},
    {0x7B, nullptr, "TerminateProcess"},
-    {0x7C, SvcWrap<GetProcessInfo>, "GetProcessInfo"},
-    {0x7D, SvcWrap<CreateResourceLimit>, "CreateResourceLimit"},
-    {0x7E, SvcWrap<SetResourceLimitLimitValue>, "SetResourceLimitLimitValue"},
+    {0x7C, SvcWrap64<GetProcessInfo>, "GetProcessInfo"},
+    {0x7D, SvcWrap64<CreateResourceLimit>, "CreateResourceLimit"},
+    {0x7E, SvcWrap64<SetResourceLimitLimitValue>, "SetResourceLimitLimitValue"},
    {0x7F, nullptr, "CallSecureMonitor"},
 };

-static const FunctionDef* GetSVCInfo(u32 func_num) {
-    if (func_num >= std::size(SVC_Table)) {
+static const FunctionDef* GetSVCInfo32(u32 func_num) {
+    if (func_num >= std::size(SVC_Table_32)) {
        LOG_ERROR(Kernel_SVC, "Unknown svc=0x{:02X}", func_num);
        return nullptr;
    }
-    return &SVC_Table[func_num];
+    return &SVC_Table_32[func_num];
+}
+
+static const FunctionDef* GetSVCInfo64(u32 func_num) {
+    if (func_num >= std::size(SVC_Table_64)) {
+        LOG_ERROR(Kernel_SVC, "Unknown svc=0x{:02X}", func_num);
+        return nullptr;
+    }
+    return &SVC_Table_64[func_num];
 }

 MICROPROFILE_DEFINE(Kernel_SVC, "Kernel", "SVC", MP_RGB(70, 200, 70));
@@ -2464,7 +2662,8 @@ void CallSVC(Core::System& system, u32 immediate) {
    // Lock the global kernel mutex when we enter the kernel HLE.
    std::lock_guard lock{HLE::g_hle_lock};

-    const FunctionDef* info = GetSVCInfo(immediate);
+    const FunctionDef* info = system.CurrentProcess()->Is64BitProcess() ? GetSVCInfo64(immediate)
+                                                                        : GetSVCInfo32(immediate);
    if (info) {
        if (info->func) {
            info->func(system);
@@ -15,6 +15,10 @@ static inline u64 Param(const Core::System& system, int n) {
    return system.CurrentArmInterface().GetReg(n);
 }

+static inline u32 Param32(const Core::System& system, int n) {
+    return static_cast<u32>(system.CurrentArmInterface().GetReg(n)); // register n cast to u32
+}
+
 /**
 * HLE a function return from the current ARM userland process
 * @param system System context
@@ -24,40 +28,44 @@ static inline void FuncReturn(Core::System& system, u64 result) {
    system.CurrentArmInterface().SetReg(0, result);
 }

+static inline void FuncReturn32(Core::System& system, u32 result) {
+    system.CurrentArmInterface().SetReg(0, static_cast<u64>(result)); // u32 result to register 0
+}
+
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // Function wrappers that return type ResultCode

 template <ResultCode func(Core::System&, u64)>
-void SvcWrap(Core::System& system) {
+void SvcWrap64(Core::System& system) {
    FuncReturn(system, func(system, Param(system, 0)).raw);
 }

 template <ResultCode func(Core::System&, u64, u64)>
-void SvcWrap(Core::System& system) {
+void SvcWrap64(Core::System& system) {
    FuncReturn(system, func(system, Param(system, 0), Param(system, 1)).raw);
 }

 template <ResultCode func(Core::System&, u32)>
-void SvcWrap(Core::System& system) {
+void SvcWrap64(Core::System& system) {
    FuncReturn(system, func(system, static_cast<u32>(Param(system, 0))).raw);
 }

 template <ResultCode func(Core::System&, u32, u32)>
-void SvcWrap(Core::System& system) {
+void SvcWrap64(Core::System& system) {
    FuncReturn(
        system,
        func(system, static_cast<u32>(Param(system, 0)), static_cast<u32>(Param(system, 1))).raw);
 }

 template <ResultCode func(Core::System&, u32, u64, u64, u64)>
-void SvcWrap(Core::System& system) {
+void SvcWrap64(Core::System& system) {
    FuncReturn(system, func(system, static_cast<u32>(Param(system, 0)), Param(system, 1),
                            Param(system, 2), Param(system, 3))
                           .raw);
 }

 template <ResultCode func(Core::System&, u32*)>
-void SvcWrap(Core::System& system) {
+void SvcWrap64(Core::System& system) {
    u32 param = 0;
    const u32 retval = func(system, &param).raw;
    system.CurrentArmInterface().SetReg(1, param);
@@ -65,7 +73,7 @@ void SvcWrap(Core::System& system) {
 }

 template <ResultCode func(Core::System&, u32*, u32)>
-void SvcWrap(Core::System& system) {
+void SvcWrap64(Core::System& system) {
    u32 param_1 = 0;
    const u32 retval = func(system, &param_1, static_cast<u32>(Param(system, 1))).raw;
    system.CurrentArmInterface().SetReg(1, param_1);
@@ -73,7 +81,7 @@ void SvcWrap(Core::System& system) {
 }

 template <ResultCode func(Core::System&, u32*, u32*)>
-void SvcWrap(Core::System& system) {
+void SvcWrap64(Core::System& system) {
    u32 param_1 = 0;
    u32 param_2 = 0;
    const u32 retval = func(system, &param_1, &param_2).raw;
@@ -86,7 +94,7 @@ void SvcWrap(Core::System& system) {
 }

 template <ResultCode func(Core::System&, u32*, u64)>
-void SvcWrap(Core::System& system) {
+void SvcWrap64(Core::System& system) {
    u32 param_1 = 0;
    const u32 retval = func(system, &param_1, Param(system, 1)).raw;
    system.CurrentArmInterface().SetReg(1, param_1);
@@ -94,7 +102,7 @@ void SvcWrap(Core::System& system) {
 }

 template <ResultCode func(Core::System&, u32*, u64, u32)>
-void SvcWrap(Core::System& system) {
+void SvcWrap64(Core::System& system) {
    u32 param_1 = 0;
    const u32 retval =
        func(system, &param_1, Param(system, 1), static_cast<u32>(Param(system, 2))).raw;
@@ -104,7 +112,7 @@ void SvcWrap(Core::System& system) {
 }

 template <ResultCode func(Core::System&, u64*, u32)>
-void SvcWrap(Core::System& system) {
+void SvcWrap64(Core::System& system) {
    u64 param_1 = 0;
    const u32 retval = func(system, &param_1, static_cast<u32>(Param(system, 1))).raw;

@@ -113,12 +121,12 @@ void SvcWrap(Core::System& system) {
 }

 template <ResultCode func(Core::System&, u64, u32)>
-void SvcWrap(Core::System& system) {
+void SvcWrap64(Core::System& system) {
    FuncReturn(system, func(system, Param(system, 0), static_cast<u32>(Param(system, 1))).raw);
 }

 template <ResultCode func(Core::System&, u64*, u64)>
-void SvcWrap(Core::System& system) {
+void SvcWrap64(Core::System& system) {
    u64 param_1 = 0;
    const u32 retval = func(system, &param_1, Param(system, 1)).raw;

@@ -127,7 +135,7 @@ void SvcWrap(Core::System& system) {
 }

 template <ResultCode func(Core::System&, u64*, u32, u32)>
-void SvcWrap(Core::System& system) {
+void SvcWrap64(Core::System& system) {
    u64 param_1 = 0;
    const u32 retval = func(system, &param_1, static_cast<u32>(Param(system, 1)),
                            static_cast<u32>(Param(system, 2)))
@@ -138,19 +146,19 @@ void SvcWrap(Core::System& system) {
 }

 template <ResultCode func(Core::System&, u32, u64)>
-void SvcWrap(Core::System& system) {
+void SvcWrap64(Core::System& system) {
    FuncReturn(system, func(system, static_cast<u32>(Param(system, 0)), Param(system, 1)).raw);
 }

 template <ResultCode func(Core::System&, u32, u32, u64)>
-void SvcWrap(Core::System& system) {
+void SvcWrap64(Core::System& system) {
    FuncReturn(system, func(system, static_cast<u32>(Param(system, 0)),
                            static_cast<u32>(Param(system, 1)), Param(system, 2))
                           .raw);
 }

 template <ResultCode func(Core::System&, u32, u32*, u64*)>
-void SvcWrap(Core::System& system) {
+void SvcWrap64(Core::System& system) {
    u32 param_1 = 0;
    u64 param_2 = 0;
    const ResultCode retval = func(system, static_cast<u32>(Param(system, 2)), &param_1, &param_2);
@@ -161,54 +169,54 @@ void SvcWrap(Core::System& system) {
 }

 template <ResultCode func(Core::System&, u64, u64, u32, u32)>
-void SvcWrap(Core::System& system) {
+void SvcWrap64(Core::System& system) {
    FuncReturn(system, func(system, Param(system, 0), Param(system, 1),
                            static_cast<u32>(Param(system, 2)), static_cast<u32>(Param(system, 3)))
                           .raw);
 }

 template <ResultCode func(Core::System&, u64, u64, u32, u64)>
-void SvcWrap(Core::System& system) {
+void SvcWrap64(Core::System& system) {
    FuncReturn(system, func(system, Param(system, 0), Param(system, 1),
                            static_cast<u32>(Param(system, 2)), Param(system, 3))
                           .raw);
 }

 template <ResultCode func(Core::System&, u32, u64, u32)>
-void SvcWrap(Core::System& system) {
+void SvcWrap64(Core::System& system) {
    FuncReturn(system, func(system, static_cast<u32>(Param(system, 0)), Param(system, 1),
                            static_cast<u32>(Param(system, 2)))
                           .raw);
 }

 template <ResultCode func(Core::System&, u64, u64, u64)>
-void SvcWrap(Core::System& system) {
+void SvcWrap64(Core::System& system) {
    FuncReturn(system, func(system, Param(system, 0), Param(system, 1), Param(system, 2)).raw);
 }

 template <ResultCode func(Core::System&, u64, u64, u32)>
-void SvcWrap(Core::System& system) {
+void SvcWrap64(Core::System& system) {
    FuncReturn(
        system,
        func(system, Param(system, 0), Param(system, 1), static_cast<u32>(Param(system, 2))).raw);
 }

 template <ResultCode func(Core::System&, u32, u64, u64, u32)>
-void SvcWrap(Core::System& system) {
+void SvcWrap64(Core::System& system) {
    FuncReturn(system, func(system, static_cast<u32>(Param(system, 0)), Param(system, 1),
                            Param(system, 2), static_cast<u32>(Param(system, 3)))
                           .raw);
 }

 template <ResultCode func(Core::System&, u32, u64, u64)>
-void SvcWrap(Core::System& system) {
+void SvcWrap64(Core::System& system) {
    FuncReturn(
        system,
        func(system, static_cast<u32>(Param(system, 0)), Param(system, 1), Param(system, 2)).raw);
 }

 template <ResultCode func(Core::System&, u32*, u64, u64, s64)>
-void SvcWrap(Core::System& system) {
+void SvcWrap64(Core::System& system) {
    u32 param_1 = 0;
    const u32 retval = func(system, &param_1, Param(system, 1), static_cast<u32>(Param(system, 2)),
                            static_cast<s64>(Param(system, 3)))
@@ -219,14 +227,14 @@ void SvcWrap(Core::System& system) {
 }

 template <ResultCode func(Core::System&, u64, u64, u32, s64)>
-void SvcWrap(Core::System& system) {
+void SvcWrap64(Core::System& system) {
    FuncReturn(system, func(system, Param(system, 0), Param(system, 1),
                            static_cast<u32>(Param(system, 2)), static_cast<s64>(Param(system, 3)))
                           .raw);
 }

 template <ResultCode func(Core::System&, u64*, u64, u64, u64)>
-void SvcWrap(Core::System& system) {
+void SvcWrap64(Core::System& system) {
    u64 param_1 = 0;
    const u32 retval =
        func(system, &param_1, Param(system, 1), Param(system, 2), Param(system, 3)).raw;
@@ -236,7 +244,7 @@ void SvcWrap(Core::System& system) {
 }

 template <ResultCode func(Core::System&, u32*, u64, u64, u64, u32, s32)>
-void SvcWrap(Core::System& system) {
+void SvcWrap64(Core::System& system) {
    u32 param_1 = 0;
    const u32 retval = func(system, &param_1, Param(system, 1), Param(system, 2), Param(system, 3),
                            static_cast<u32>(Param(system, 4)), static_cast<s32>(Param(system, 5)))
@@ -247,7 +255,7 @@ void SvcWrap(Core::System& system) {
 }

 template <ResultCode func(Core::System&, u32*, u64, u64, u32)>
-void SvcWrap(Core::System& system) {
+void SvcWrap64(Core::System& system) {
    u32 param_1 = 0;
    const u32 retval = func(system, &param_1, Param(system, 1), Param(system, 2),
                            static_cast<u32>(Param(system, 3)))
@@ -258,7 +266,7 @@ void SvcWrap(Core::System& system) {
 }

 template <ResultCode func(Core::System&, Handle*, u64, u32, u32)>
-void SvcWrap(Core::System& system) {
+void SvcWrap64(Core::System& system) {
    u32 param_1 = 0;
    const u32 retval = func(system, &param_1, Param(system, 1), static_cast<u32>(Param(system, 2)),
                            static_cast<u32>(Param(system, 3)))
@@ -269,14 +277,14 @@ void SvcWrap(Core::System& system) {
 }

 template <ResultCode func(Core::System&, u64, u32, s32, s64)>
-void SvcWrap(Core::System& system) {
+void SvcWrap64(Core::System& system) {
    FuncReturn(system, func(system, Param(system, 0), static_cast<u32>(Param(system, 1)),
                            static_cast<s32>(Param(system, 2)), static_cast<s64>(Param(system, 3)))
                           .raw);
 }

 template <ResultCode func(Core::System&, u64, u32, s32, s32)>
-void SvcWrap(Core::System& system) {
+void SvcWrap64(Core::System& system) {
    FuncReturn(system, func(system, Param(system, 0), static_cast<u32>(Param(system, 1)),
                            static_cast<s32>(Param(system, 2)), static_cast<s32>(Param(system, 3)))
                           .raw);
@@ -286,7 +294,7 @@ void SvcWrap(Core::System& system) {
 // Function wrappers that return type u32

 template <u32 func(Core::System&)>
-void SvcWrap(Core::System& system) {
+void SvcWrap64(Core::System& system) {
    FuncReturn(system, func(system));
 }

@@ -294,7 +302,7 @@ void SvcWrap(Core::System& system) {
 // Function wrappers that return type u64

 template <u64 func(Core::System&)>
-void SvcWrap(Core::System& system) {
+void SvcWrap64(Core::System& system) {
    FuncReturn(system, func(system));
 }

@@ -302,44 +310,110 @@ void SvcWrap(Core::System& system) {
 /// Function wrappers that return type void

 template <void func(Core::System&)>
-void SvcWrap(Core::System& system) {
+void SvcWrap64(Core::System& system) {
    func(system);
 }

 template <void func(Core::System&, u32)>
-void SvcWrap(Core::System& system) {
+void SvcWrap64(Core::System& system) {
    func(system, static_cast<u32>(Param(system, 0)));
 }

 template <void func(Core::System&, u32, u64, u64, u64)>
-void SvcWrap(Core::System& system) {
+void SvcWrap64(Core::System& system) {
    func(system, static_cast<u32>(Param(system, 0)), Param(system, 1), Param(system, 2),
         Param(system, 3));
 }

 template <void func(Core::System&, s64)>
-void SvcWrap(Core::System& system) {
+void SvcWrap64(Core::System& system) {
    func(system, static_cast<s64>(Param(system, 0)));
 }

 template <void func(Core::System&, u64, s32)>
-void SvcWrap(Core::System& system) {
+void SvcWrap64(Core::System& system) {
    func(system, Param(system, 0), static_cast<s32>(Param(system, 1)));
 }

 template <void func(Core::System&, u64, u64)>
-void SvcWrap(Core::System& system) {
+void SvcWrap64(Core::System& system) {
    func(system, Param(system, 0), Param(system, 1));
 }

 template <void func(Core::System&, u64, u64, u64)>
-void SvcWrap(Core::System& system) {
+void SvcWrap64(Core::System& system) {
    func(system, Param(system, 0), Param(system, 1), Param(system, 2));
 }

 template <void func(Core::System&, u32, u64, u64)>
-void SvcWrap(Core::System& system) {
+void SvcWrap64(Core::System& system) {
    func(system, static_cast<u32>(Param(system, 0)), Param(system, 1), Param(system, 2));
 }

+// Used by QueryMemory32: three u32 arguments read from registers 0-2
+template <ResultCode func(Core::System&, u32, u32, u32)>
+void SvcWrap32(Core::System& system) {
+    const ResultCode rc = func(system, Param32(system, 0), Param32(system, 1), Param32(system, 2));
+    FuncReturn32(system, rc.raw);
+}
+
+// Used by GetInfo32: inputs come from registers 0-3, the two outputs go to registers 1 and 2
+template <ResultCode func(Core::System&, u32*, u32*, u32, u32, u32, u32)>
+void SvcWrap32(Core::System& system) {
+    u32 param_1 = 0;
+    u32 param_2 = 0;
+
+    const u32 retval = func(system, &param_1, &param_2, Param32(system, 0), Param32(system, 1),
+                            Param32(system, 2), Param32(system, 3))
+                           .raw;
+
+    system.CurrentArmInterface().SetReg(1, param_1);
+    system.CurrentArmInterface().SetReg(2, param_2);
+    FuncReturn32(system, retval); // 32-bit counterpart of FuncReturn
+}
+
+// Used by GetThreadPriority32, ConnectToNamedPort32: input in register 1, output to register 1
+template <ResultCode func(Core::System&, u32*, u32)>
+void SvcWrap32(Core::System& system) {
+    u32 param_1 = 0;
+    const u32 retval = func(system, &param_1, Param32(system, 1)).raw;
+    system.CurrentArmInterface().SetReg(1, param_1);
+    FuncReturn32(system, retval); // 32-bit counterpart of FuncReturn
+}
+
+// Used by GetThreadId32: input in register 1, the two outputs go to registers 1 and 2
+template <ResultCode func(Core::System&, u32*, u32*, u32)>
+void SvcWrap32(Core::System& system) {
+    u32 param_1 = 0;
+    u32 param_2 = 0;
+
+    const u32 retval = func(system, &param_1, &param_2, Param32(system, 1)).raw;
+    system.CurrentArmInterface().SetReg(1, param_1);
+    system.CurrentArmInterface().SetReg(2, param_2);
+    FuncReturn32(system, retval); // 32-bit counterpart of FuncReturn
+}
+
+// Used by SignalProcessWideKey32: arguments from registers 0 and 1; func returns nothing
+template <void func(Core::System&, u32, s32)>
+void SvcWrap32(Core::System& system) {
+    func(system, Param32(system, 0), static_cast<s32>(Param32(system, 1)));
+}
+
+// Used by SendSyncRequest32: single u32 argument read from register 0
+template <ResultCode func(Core::System&, u32)>
+void SvcWrap32(Core::System& system) {
+    FuncReturn32(system, func(system, Param32(system, 0)).raw);
+}
+
+// Used by WaitSynchronization32: inputs from registers 0-3, output value written to register 1
+template <ResultCode func(Core::System&, u32, u32, s32, u32, Handle*)>
+void SvcWrap32(Core::System& system) {
+    u32 param_1 = 0;
+    const u32 retval = func(system, Param32(system, 0), Param32(system, 1), Param32(system, 2),
+                            Param32(system, 3), &param_1)
+                           .raw;
+    system.CurrentArmInterface().SetReg(1, param_1);
+    FuncReturn32(system, retval); // 32-bit counterpart of FuncReturn
+}
+
 } // namespace Kernel
@@ -46,9 +46,9 @@ Thread::~Thread() = default;
 void Thread::Stop() {
    // Cancel any outstanding wakeup events for this thread
    Core::System::GetInstance().CoreTiming().UnscheduleEvent(kernel.ThreadWakeupCallbackEventType(),
-                                                             callback_handle);
-    kernel.ThreadWakeupCallbackHandleTable().Close(callback_handle);
-    callback_handle = 0;
+                                                             global_handle);
+    kernel.GlobalHandleTable().Close(global_handle);
+    global_handle = 0;
    SetStatus(ThreadStatus::Dead);
    Signal();

@@ -73,12 +73,12 @@ void Thread::WakeAfterDelay(s64 nanoseconds) {
    // thread-safe version of ScheduleEvent.
    const s64 cycles = Core::Timing::nsToCycles(std::chrono::nanoseconds{nanoseconds});
    Core::System::GetInstance().CoreTiming().ScheduleEvent(
-        cycles, kernel.ThreadWakeupCallbackEventType(), callback_handle);
+        cycles, kernel.ThreadWakeupCallbackEventType(), global_handle);
 }

 void Thread::CancelWakeupTimer() {
    Core::System::GetInstance().CoreTiming().UnscheduleEvent(kernel.ThreadWakeupCallbackEventType(),
-                                                             callback_handle);
+                                                             global_handle);
 }

 void Thread::ResumeFromWait() {
@@ -133,15 +133,16 @@ void Thread::CancelWait() {
    ResumeFromWait();
 }

-/**
- * Resets a thread context, making it ready to be scheduled and run by the CPU
- * @param context Thread context to reset
- * @param stack_top Address of the top of the stack
- * @param entry_point Address of entry point for execution
- * @param arg User argument for thread
- */
-static void ResetThreadContext(Core::ARM_Interface::ThreadContext& context, VAddr stack_top,
-                               VAddr entry_point, u64 arg) {
+static void ResetThreadContext32(Core::ARM_Interface::ThreadContext32& context, u32 stack_top,
+                                 u32 entry_point, u32 arg) {
+    context = {}; // Value-initialize the whole context before filling in the start state.
+    context.cpu_registers[0] = arg; // Index 0: user argument for the thread.
+    context.cpu_registers[15] = entry_point; // Index 15: program counter (r15) in AArch32.
+    context.cpu_registers[13] = stack_top; // Index 13: stack pointer (r13) in AArch32.
+}
+
+static void ResetThreadContext64(Core::ARM_Interface::ThreadContext64& context, VAddr stack_top,
+                                 VAddr entry_point, u64 arg) {
    context = {};
    context.cpu_registers[0] = arg;
    context.pc = entry_point;
@@ -190,7 +191,7 @@ ResultVal<std::shared_ptr<Thread>> Thread::Create(KernelCore& kernel, std::strin
    thread->condvar_wait_address = 0;
    thread->wait_handle = 0;
    thread->name = std::move(name);
-    thread->callback_handle = kernel.ThreadWakeupCallbackHandleTable().Create(thread).Unwrap();
+    thread->global_handle = kernel.GlobalHandleTable().Create(thread).Unwrap();
    thread->owner_process = &owner_process;
    auto& scheduler = kernel.GlobalScheduler();
    scheduler.AddThread(thread);
@@ -198,9 +199,9 @@ ResultVal<std::shared_ptr<Thread>> Thread::Create(KernelCore& kernel, std::strin

    thread->owner_process->RegisterThread(thread.get());

-    // TODO(peachum): move to ScheduleThread() when scheduler is added so selected core is used
-    // to initialize the context
-    ResetThreadContext(thread->context, stack_top, entry_point, arg);
+    ResetThreadContext32(thread->context_32, static_cast<u32>(stack_top),
+                         static_cast<u32>(entry_point), static_cast<u32>(arg));
+    ResetThreadContext64(thread->context_64, stack_top, entry_point, arg);

    return MakeResult<std::shared_ptr<Thread>>(std::move(thread));
 }
@@ -213,11 +214,13 @@ void Thread::SetPriority(u32 priority) {
 }

 void Thread::SetWaitSynchronizationResult(ResultCode result) {
-    context.cpu_registers[0] = result.raw;
+    context_32.cpu_registers[0] = result.raw;
+    context_64.cpu_registers[0] = result.raw;
 }

 void Thread::SetWaitSynchronizationOutput(s32 output) {
-    context.cpu_registers[1] = output;
+    context_32.cpu_registers[1] = output;
+    context_64.cpu_registers[1] = output;
 }

 s32 Thread::GetSynchronizationObjectIndex(std::shared_ptr<SynchronizationObject> object) const {
@@ -102,7 +102,8 @@ public:

    using MutexWaitingThreads = std::vector<std::shared_ptr<Thread>>;

-    using ThreadContext = Core::ARM_Interface::ThreadContext;
+    using ThreadContext32 = Core::ARM_Interface::ThreadContext32;
+    using ThreadContext64 = Core::ARM_Interface::ThreadContext64;

    using ThreadSynchronizationObjects = std::vector<std::shared_ptr<SynchronizationObject>>;

@@ -273,12 +274,20 @@ public:
        return status == ThreadStatus::WaitSynch;
    }

-    ThreadContext& GetContext() {
-        return context;
+    ThreadContext32& GetContext32() {
+        return context_32;
    }

-    const ThreadContext& GetContext() const {
-        return context;
+    const ThreadContext32& GetContext32() const {
+        return context_32;
+    }
+
+    ThreadContext64& GetContext64() {
+        return context_64;
+    }
+
+    const ThreadContext64& GetContext64() const {
+        return context_64;
    }

    ThreadStatus GetStatus() const {
@@ -453,6 +462,10 @@ public:
        is_sync_cancelled = value;
    }

+    Handle GetGlobalHandle() const {
+        return global_handle; // Handle created for this thread in the kernel's global table.
+    }
+
 private:
    void SetSchedulingStatus(ThreadSchedStatus new_status);
    void SetCurrentPriority(u32 new_priority);
@@ -462,7 +475,8 @@ private:
    void AdjustSchedulingOnPriority(u32 old_priority);
    void AdjustSchedulingOnAffinity(u64 old_affinity_mask, s32 old_core);

-    Core::ARM_Interface::ThreadContext context{};
+    ThreadContext32 context_32{};
+    ThreadContext64 context_64{};

    u64 thread_id = 0;

@@ -514,7 +528,7 @@ private:
    VAddr arb_wait_address{0};

    /// Handle used as userdata to reference this object when inserting into the CoreTiming queue.
-    Handle callback_handle = 0;
+    Handle global_handle = 0;

    /// Callback that will be invoked when the thread is resumed from a waiting state. If the thread
    /// was waiting via WaitSynchronization then the object will be the last object that became
@@ -0,0 +1,44 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include "common/assert.h"
+#include "core/core.h"
+#include "core/core_timing.h"
+#include "core/core_timing_util.h"
+#include "core/hle/kernel/handle_table.h"
+#include "core/hle/kernel/kernel.h"
+#include "core/hle/kernel/thread.h"
+#include "core/hle/kernel/time_manager.h"
+
+namespace Kernel {
+
+TimeManager::TimeManager(Core::System& system) : system{system} {
+    time_manager_event_type = Core::Timing::CreateEvent(
+        "Kernel::TimeManagerCallback", [this](u64 thread_handle, [[maybe_unused]] s64 cycles_late) {
+            const auto handle = static_cast<Handle>(thread_handle);
+            if (auto thread = this->system.Kernel().RetrieveThreadFromGlobalHandleTable(handle)) {
+                thread->ResumeFromWait();
+            } // Null thread: presumably its global handle was already closed; skip the event.
+        });
+}
+
+void TimeManager::ScheduleTimeEvent(Handle& event_handle, Thread* timetask, s64 nanoseconds) {
+    if (nanoseconds <= 0) {
+        event_handle = InvalidHandle; // Nothing is scheduled for non-positive timeouts.
+        return;
+    }
+    ASSERT(timetask);
+    event_handle = timetask->GetGlobalHandle();
+    const s64 cycles = Core::Timing::nsToCycles(std::chrono::nanoseconds{nanoseconds});
+    system.CoreTiming().ScheduleEvent(cycles, time_manager_event_type, event_handle);
+}
+
+void TimeManager::UnscheduleTimeEvent(Handle event_handle) {
+    // InvalidHandle marks an event that was never scheduled; there is nothing to cancel.
+    if (event_handle != InvalidHandle) {
+        system.CoreTiming().UnscheduleEvent(time_manager_event_type, event_handle);
+    }
+}
+
+} // namespace Kernel
@@ -0,0 +1,43 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <memory>
+
+#include "core/hle/kernel/object.h"
+
+namespace Core {
+class System;
+} // namespace Core
+
+namespace Core::Timing {
+struct EventType;
+} // namespace Core::Timing
+
+namespace Kernel {
+
+class Thread;
+
+/**
+ * The `TimeManager` takes care of scheduling time events on threads and resumes the waiting
+ * thread when its event is triggered.
+ */
+class TimeManager {
+public:
+    explicit TimeManager(Core::System& system);
+
+    /// Schedules a time event on the `timetask` thread that expires in `nanoseconds`.
+    /// `event_handle` gets the thread's global handle, or InvalidHandle if nothing was scheduled.
+    void ScheduleTimeEvent(Handle& event_handle, Thread* timetask, s64 nanoseconds);
+
+    /// Unschedules an existing time event; an InvalidHandle is ignored.
+    void UnscheduleTimeEvent(Handle event_handle);
+
+private:
+    Core::System& system;
+    std::shared_ptr<Core::Timing::EventType> time_manager_event_type;
+};
+
+} // namespace Kernel
@@ -607,7 +607,7 @@ ICommonStateGetter::ICommonStateGetter(Core::System& system,
        {40, nullptr, "GetCradleFwVersion"},
        {50, nullptr, "IsVrModeEnabled"},
        {51, nullptr, "SetVrModeEnabled"},
-        {52, nullptr, "SwitchLcdBacklight"},
+        {52, &ICommonStateGetter::SetLcdBacklighOffEnabled, "SetLcdBacklighOffEnabled"},
        {53, nullptr, "BeginVrModeEx"},
        {54, nullptr, "EndVrModeEx"},
        {55, nullptr, "IsInControllerFirmwareUpdateSection"},
@@ -636,7 +636,6 @@ void ICommonStateGetter::GetBootMode(Kernel::HLERequestContext& ctx) {

    IPC::ResponseBuilder rb{ctx, 3};
    rb.Push(RESULT_SUCCESS);
-
    rb.Push<u8>(static_cast<u8>(Service::PM::SystemBootMode::Normal)); // Normal boot mode
 }

@@ -660,6 +659,7 @@ void ICommonStateGetter::ReceiveMessage(Kernel::HLERequestContext& ctx) {
        rb.PushEnum<AppletMessageQueue::AppletMessage>(message);
        return;
    }
+
    rb.Push(RESULT_SUCCESS);
    rb.PushEnum<AppletMessageQueue::AppletMessage>(message);
 }
@@ -672,6 +672,17 @@ void ICommonStateGetter::GetCurrentFocusState(Kernel::HLERequestContext& ctx) {
    rb.Push(static_cast<u8>(FocusState::InFocus));
 }

+void ICommonStateGetter::SetLcdBacklighOffEnabled(Kernel::HLERequestContext& ctx) {
+    // Stub: the requested backlight state is only logged; nothing else is done with it.
+    IPC::RequestParser parser{ctx};
+    const bool backlight_off = parser.Pop<bool>();
+
+    LOG_WARNING(Service_AM, "(STUBBED) called. is_lcd_backlight_off_enabled={}", backlight_off);
+
+    IPC::ResponseBuilder response{ctx, 2};
+    response.Push(RESULT_SUCCESS);
+}
+
 void ICommonStateGetter::GetDefaultDisplayResolutionChangeEvent(Kernel::HLERequestContext& ctx) {
    LOG_DEBUG(Service_AM, "called");

@@ -182,6 +182,7 @@ private:
    void GetOperationMode(Kernel::HLERequestContext& ctx);
    void GetPerformanceMode(Kernel::HLERequestContext& ctx);
    void GetBootMode(Kernel::HLERequestContext& ctx);
+    void SetLcdBacklighOffEnabled(Kernel::HLERequestContext& ctx);
    void GetDefaultDisplayResolution(Kernel::HLERequestContext& ctx);
    void SetCpuBoostMode(Kernel::HLERequestContext& ctx);

@@ -200,7 +200,8 @@ private:
    DownloadResult DownloadInternal(const std::string& resolved_path, u32 timeout_seconds,
                                    const std::string& content_type_name) {
        if (client == nullptr) {
-            client = std::make_unique<httplib::SSLClient>(BOXCAT_HOSTNAME, PORT, timeout_seconds);
+            client = std::make_unique<httplib::SSLClient>(BOXCAT_HOSTNAME, PORT);
+            client->set_timeout_sec(timeout_seconds);
        }

        httplib::Headers headers{
@@ -448,8 +449,8 @@ std::optional<std::vector<u8>> Boxcat::GetLaunchParameter(TitleIDVersion title)

 Boxcat::StatusResult Boxcat::GetStatus(std::optional<std::string>& global,
                                       std::map<std::string, EventStatus>& games) {
-    httplib::SSLClient client{BOXCAT_HOSTNAME, static_cast<int>(PORT),
-                              static_cast<int>(TIMEOUT_SECONDS)};
+    httplib::SSLClient client{BOXCAT_HOSTNAME, static_cast<int>(PORT)};
+    client.set_timeout_sec(static_cast<int>(TIMEOUT_SECONDS));

    httplib::Headers headers{
        {std::string("Game-Assets-API-Version"), std::string(BOXCAT_API_VERSION)},
@@ -287,13 +287,13 @@ void Controller_NPad::RequestPadStateUpdate(u32 npad_id) {
        analog_state[static_cast<std::size_t>(JoystickId::Joystick_Left)]->GetAnalogDirectionStatus(
            Input::AnalogDirection::DOWN));

-    pad_state.r_stick_up.Assign(analog_state[static_cast<std::size_t>(JoystickId::Joystick_Right)]
-                                    ->GetAnalogDirectionStatus(Input::AnalogDirection::RIGHT));
-    pad_state.r_stick_left.Assign(analog_state[static_cast<std::size_t>(JoystickId::Joystick_Right)]
-                                      ->GetAnalogDirectionStatus(Input::AnalogDirection::LEFT));
    pad_state.r_stick_right.Assign(
        analog_state[static_cast<std::size_t>(JoystickId::Joystick_Right)]
-            ->GetAnalogDirectionStatus(Input::AnalogDirection::UP));
+            ->GetAnalogDirectionStatus(Input::AnalogDirection::RIGHT));
+    pad_state.r_stick_left.Assign(analog_state[static_cast<std::size_t>(JoystickId::Joystick_Right)]
+                                      ->GetAnalogDirectionStatus(Input::AnalogDirection::LEFT));
+    pad_state.r_stick_up.Assign(analog_state[static_cast<std::size_t>(JoystickId::Joystick_Right)]
+                                    ->GetAnalogDirectionStatus(Input::AnalogDirection::UP));
    pad_state.r_stick_down.Assign(analog_state[static_cast<std::size_t>(JoystickId::Joystick_Right)]
                                      ->GetAnalogDirectionStatus(Input::AnalogDirection::DOWN));

@@ -129,12 +129,20 @@ public:
            {304, nullptr, "Disconnect"},
            {400, nullptr, "Initialize"},
            {401, nullptr, "Finalize"},
-            {402, nullptr, "SetOperationMode"},
+            {402, &IUserLocalCommunicationService::Initialize2, "Initialize2"}, // 7.0.0+
        };
        // clang-format on

        RegisterHandlers(functions);
    }
+
+    void Initialize2(Kernel::HLERequestContext& ctx) {
+        LOG_WARNING(Service_LDN, "(STUBBED) called");
+        // Reporting success seems to make the caller start networking and carry on, so an error
+        // is returned instead; the caller then stops, although it may retry again and again.
+        IPC::ResponseBuilder rb{ctx, 2};
+        rb.Push(RESULT_UNKNOWN);
+    }
 };

 class LDNS final : public ServiceFramework<LDNS> {
@@ -44,6 +44,8 @@ u32 nvhost_gpu::ioctl(Ioctl command, const std::vector<u8>& input, const std::ve
        return GetWaitbase(input, output);
    case IoctlCommand::IocChannelSetTimeoutCommand:
        return ChannelSetTimeout(input, output);
+    case IoctlCommand::IocChannelSetTimeslice:
+        return ChannelSetTimeslice(input, output);
    default:
        break;
    }
@@ -228,4 +230,14 @@ u32 nvhost_gpu::ChannelSetTimeout(const std::vector<u8>& input, std::vector<u8>&
    return 0;
 }

+u32 nvhost_gpu::ChannelSetTimeslice(const std::vector<u8>& input, std::vector<u8>& output) {
+    // Decode the ioctl payload and remember the requested timeslice; no output is written.
+    IoctlSetTimeslice request{};
+    std::memcpy(&request, input.data(), sizeof(request));
+    LOG_INFO(Service_NVDRV, "called, timeslice=0x{:X}", request.timeslice);
+
+    channel_timeslice = request.timeslice;
+    return 0;
+}
+
 } // namespace Service::Nvidia::Devices
@@ -48,6 +48,7 @@ private:
        IocAllocObjCtxCommand = 0xC0104809,
        IocChannelGetWaitbaseCommand = 0xC0080003,
        IocChannelSetTimeoutCommand = 0x40044803,
+        IocChannelSetTimeslice = 0xC004481D,
    };

    enum class CtxObjects : u32_le {
@@ -101,6 +102,11 @@ private:
    static_assert(sizeof(IoctlChannelSetPriority) == 4,
                  "IoctlChannelSetPriority is incorrect size");

+    struct IoctlSetTimeslice {
+        u32_le timeslice; // Stored verbatim by ChannelSetTimeslice; units not visible here.
+    };
+    static_assert(sizeof(IoctlSetTimeslice) == 4, "IoctlSetTimeslice is incorrect size");
+
    struct IoctlEventIdControl {
        u32_le cmd; // 0=disable, 1=enable, 2=clear
        u32_le id;
@@ -174,6 +180,7 @@ private:
    u64_le user_data{};
    IoctlZCullBind zcull_params{};
    u32_le channel_priority{};
+    u32_le channel_timeslice{};

    u32 SetNVMAPfd(const std::vector<u8>& input, std::vector<u8>& output);
    u32 SetClientData(const std::vector<u8>& input, std::vector<u8>& output);
@@ -188,6 +195,7 @@ private:
                  const std::vector<u8>& input2, IoctlVersion version);
    u32 GetWaitbase(const std::vector<u8>& input, std::vector<u8>& output);
    u32 ChannelSetTimeout(const std::vector<u8>& input, std::vector<u8>& output);
+    u32 ChannelSetTimeslice(const std::vector<u8>& input, std::vector<u8>& output);

    std::shared_ptr<nvmap> nvmap_dev;
    u32 assigned_syncpoints{};
@@ -129,12 +129,6 @@ AppLoader_DeconstructedRomDirectory::LoadResult AppLoader_DeconstructedRomDirect
    }
    metadata.Print();

-    const FileSys::ProgramAddressSpaceType arch_bits{metadata.GetAddressSpaceType()};
-    if (arch_bits == FileSys::ProgramAddressSpaceType::Is32Bit ||
-        arch_bits == FileSys::ProgramAddressSpaceType::Is32BitNoMap) {
-        return {ResultStatus::Error32BitISA, {}};
-    }
-
    if (process.LoadFromMetadata(metadata).IsError()) {
        return {ResultStatus::ErrorUnableToParseKernelMetadata, {}};
    }
@@ -111,7 +111,7 @@ json GetProcessorStateDataAuto(Core::System& system) {
    const auto& vm_manager{process->VMManager()};
    auto& arm{system.CurrentArmInterface()};

-    Core::ARM_Interface::ThreadContext context{};
+    Core::ARM_Interface::ThreadContext64 context{};
    arm.SaveContext(context);

    return GetProcessorStateData(process->Is64BitProcess() ? "AArch64" : "AArch32",
@@ -94,6 +94,7 @@ void LogSettings() {
    LogSetting("Renderer_UseAccurateGpuEmulation", Settings::values.use_accurate_gpu_emulation);
    LogSetting("Renderer_UseAsynchronousGpuEmulation",
               Settings::values.use_asynchronous_gpu_emulation);
+    LogSetting("Renderer_UseVsync", Settings::values.use_vsync);
    LogSetting("Audio_OutputEngine", Settings::values.sink_id);
    LogSetting("Audio_EnableAudioStretching", Settings::values.enable_audio_stretching);
    LogSetting("Audio_OutputDevice", Settings::values.audio_device_id);
@@ -429,11 +429,13 @@ struct Values {
    int vulkan_device;

    float resolution_factor;
+    int aspect_ratio;
    bool use_frame_limit;
    u16 frame_limit;
    bool use_disk_shader_cache;
    bool use_accurate_gpu_emulation;
    bool use_asynchronous_gpu_emulation;
+    bool use_vsync;
    bool force_30fps_mode;

    float bg_red;
@@ -188,6 +188,7 @@ void TelemetrySession::AddInitialInfo(Loader::AppLoader& app_loader) {
             Settings::values.use_accurate_gpu_emulation);
    AddField(field_type, "Renderer_UseAsynchronousGpuEmulation",
             Settings::values.use_asynchronous_gpu_emulation);
+    AddField(field_type, "Renderer_UseVsync", Settings::values.use_vsync);
    AddField(field_type, "System_UseDockedMode", Settings::values.use_docked_mode);
 }

@@ -34,6 +34,20 @@ public:
                               y * coef * (x == 0 ? 1.0f : SQRT_HALF));
    }

+    bool GetAnalogDirectionStatus(Input::AnalogDirection direction) const override {
+        switch (direction) { // Each direction reports the status of its own button.
+        case Input::AnalogDirection::RIGHT:
+            return right->GetStatus();
+        case Input::AnalogDirection::LEFT:
+            return left->GetStatus();
+        case Input::AnalogDirection::UP:
+            return up->GetStatus();
+        case Input::AnalogDirection::DOWN:
+            return down->GetStatus();
+        }
+        return false; // Reached only for values outside the four handled directions.
+    }
+
 private:
    Button up;
    Button down;
@@ -32,8 +32,16 @@ public:
                    SocketCallback callback)
        : callback(std::move(callback)), timer(io_service),
          socket(io_service, udp::endpoint(udp::v4(), 0)), client_id(client_id),
-          pad_index(pad_index),
-          send_endpoint(udp::endpoint(boost::asio::ip::make_address_v4(host), port)) {}
+          pad_index(pad_index) {
+        boost::system::error_code ec{};
+        auto ipv4 = boost::asio::ip::make_address_v4(host, ec);
+        if (ec.failed()) {
+            LOG_ERROR(Input, "Invalid IPv4 address \"{}\" provided to socket", host);
+            ipv4 = boost::asio::ip::address_v4{};
+        }
+
+        send_endpoint = {udp::endpoint(ipv4, port)};
+    }

    void Stop() {
        io_service.stop();
@@ -85,17 +93,18 @@ private:
    }

    void HandleSend(const boost::system::error_code& error) {
+        boost::system::error_code _ignored{};
        // Send a request for getting port info for the pad
        Request::PortInfo port_info{1, {pad_index, 0, 0, 0}};
        const auto port_message = Request::Create(port_info, client_id);
        std::memcpy(&send_buffer1, &port_message, PORT_INFO_SIZE);
-        socket.send_to(boost::asio::buffer(send_buffer1), send_endpoint);
+        socket.send_to(boost::asio::buffer(send_buffer1), send_endpoint, {}, _ignored);

        // Send a request for getting pad data for the pad
        Request::PadData pad_data{Request::PadData::Flags::Id, pad_index, EMPTY_MAC_ADDRESS};
        const auto pad_message = Request::Create(pad_data, client_id);
        std::memcpy(send_buffer2.data(), &pad_message, PAD_DATA_SIZE);
-        socket.send_to(boost::asio::buffer(send_buffer2), send_endpoint);
+        socket.send_to(boost::asio::buffer(send_buffer2), send_endpoint, {}, _ignored);
        StartSend(timer.expiry());
    }

@@ -31,7 +31,6 @@ namespace Response {
 */
 std::optional<Type> Validate(u8* data, std::size_t size) {
    if (size < sizeof(Header)) {
-        LOG_DEBUG(Input, "Invalid UDP packet received");
        return std::nullopt;
    }
    Header header{};
@@ -37,6 +37,7 @@ add_library(video_core STATIC
    memory_manager.h
    morton.cpp
    morton.h
+    query_cache.h
    rasterizer_accelerated.cpp
    rasterizer_accelerated.h
    rasterizer_cache.cpp
@@ -74,6 +75,8 @@ add_library(video_core STATIC
    renderer_opengl/gl_stream_buffer.h
    renderer_opengl/gl_texture_cache.cpp
    renderer_opengl/gl_texture_cache.h
+    renderer_opengl/gl_query_cache.cpp
+    renderer_opengl/gl_query_cache.h
    renderer_opengl/maxwell_to_gl.h
    renderer_opengl/renderer_opengl.cpp
    renderer_opengl/renderer_opengl.h
@@ -177,6 +180,8 @@ if (ENABLE_VULKAN)
        renderer_vulkan/vk_memory_manager.h
        renderer_vulkan/vk_pipeline_cache.cpp
        renderer_vulkan/vk_pipeline_cache.h
+        renderer_vulkan/vk_query_cache.cpp
+        renderer_vulkan/vk_query_cache.h
        renderer_vulkan/vk_rasterizer.cpp
        renderer_vulkan/vk_rasterizer.h
        renderer_vulkan/vk_renderpass_cache.cpp
@@ -4,6 +4,7 @@

 #include <cinttypes>
 #include <cstring>
+#include <optional>
 #include "common/assert.h"
 #include "core/core.h"
 #include "core/core_timing.h"
@@ -16,6 +17,8 @@

 namespace Tegra::Engines {

+using VideoCore::QueryType;
+
 /// First register id that is actually a Macro call.
 constexpr u32 MacroRegistersStart = 0xE00;

@@ -400,6 +403,10 @@ void Maxwell3D::CallMethod(const GPU::MethodCall& method_call) {
        ProcessQueryCondition();
        break;
    }
+    case MAXWELL3D_REG_INDEX(counter_reset): {
+        ProcessCounterReset();
+        break;
+    }
    case MAXWELL3D_REG_INDEX(sync_info): {
        ProcessSyncPoint();
        break;
@@ -482,7 +489,7 @@ void Maxwell3D::FlushMMEInlineDraw() {

    const bool is_indexed = mme_draw.current_mode == MMEDrawMode::Indexed;
    if (ShouldExecute()) {
-        rasterizer.DrawMultiBatch(is_indexed);
+        rasterizer.Draw(is_indexed, true);
    }

    // TODO(bunnei): Below, we reset vertex count so that we can use these registers to determine if
@@ -544,40 +551,28 @@ void Maxwell3D::ProcessQueryGet() {
               "Units other than CROP are unimplemented");

    switch (regs.query.query_get.operation) {
-    case Regs::QueryOperation::Release: {
-        const u64 result = regs.query.query_sequence;
-        StampQueryResult(result, regs.query.query_get.short_query == 0);
+    case Regs::QueryOperation::Release:
+        StampQueryResult(regs.query.query_sequence, regs.query.query_get.short_query == 0);
        break;
-    }
-    case Regs::QueryOperation::Acquire: {
-        // Todo(Blinkhawk): Under this operation, the GPU waits for the CPU
-        // to write a value that matches the current payload.
+    case Regs::QueryOperation::Acquire:
+        // TODO(Blinkhawk): Under this operation, the GPU waits for the CPU to write a value that
+        // matches the current payload.
        UNIMPLEMENTED_MSG("Unimplemented query operation ACQUIRE");
        break;
-    }
-    case Regs::QueryOperation::Counter: {
-        u64 result{};
-        switch (regs.query.query_get.select) {
-        case Regs::QuerySelect::Zero:
-            result = 0;
-            break;
-        default:
-            result = 1;
-            UNIMPLEMENTED_MSG("Unimplemented query select type {}",
-                              static_cast<u32>(regs.query.query_get.select.Value()));
+    case Regs::QueryOperation::Counter:
+        if (const std::optional<u64> result = GetQueryResult()) {
+            // If the query returns an empty optional it means it's cached and deferred.
+            // In this case we have a non-empty result, so we stamp it immediately.
+            StampQueryResult(*result, regs.query.query_get.short_query == 0);
        }
-        StampQueryResult(result, regs.query.query_get.short_query == 0);
        break;
-    }
-    case Regs::QueryOperation::Trap: {
+    case Regs::QueryOperation::Trap:
        UNIMPLEMENTED_MSG("Unimplemented query operation TRAP");
        break;
-    }
-    default: {
+    default:
        UNIMPLEMENTED_MSG("Unknown query operation");
        break;
    }
-    }
 }

 void Maxwell3D::ProcessQueryCondition() {
@@ -593,20 +588,20 @@ void Maxwell3D::ProcessQueryCondition() {
    }
    case Regs::ConditionMode::ResNonZero: {
        Regs::QueryCompare cmp;
-        memory_manager.ReadBlockUnsafe(condition_address, &cmp, sizeof(cmp));
+        memory_manager.ReadBlock(condition_address, &cmp, sizeof(cmp));
        execute_on = cmp.initial_sequence != 0U && cmp.initial_mode != 0U;
        break;
    }
    case Regs::ConditionMode::Equal: {
        Regs::QueryCompare cmp;
-        memory_manager.ReadBlockUnsafe(condition_address, &cmp, sizeof(cmp));
+        memory_manager.ReadBlock(condition_address, &cmp, sizeof(cmp));
        execute_on =
            cmp.initial_sequence == cmp.current_sequence && cmp.initial_mode == cmp.current_mode;
        break;
    }
    case Regs::ConditionMode::NotEqual: {
        Regs::QueryCompare cmp;
-        memory_manager.ReadBlockUnsafe(condition_address, &cmp, sizeof(cmp));
+        memory_manager.ReadBlock(condition_address, &cmp, sizeof(cmp));
        execute_on =
            cmp.initial_sequence != cmp.current_sequence || cmp.initial_mode != cmp.current_mode;
        break;
@@ -619,6 +614,18 @@ void Maxwell3D::ProcessQueryCondition() {
    }
 }

+void Maxwell3D::ProcessCounterReset() {
+    // Only the sample counter can be reset so far; other reset requests are just logged.
+    if (regs.counter_reset == Regs::CounterReset::SampleCnt) {
+        rasterizer.ResetCounter(QueryType::SamplesPassed);
+        return;
+    }
+
+    // NOTE(review): logged under Render_OpenGL although this engine code is renderer-agnostic.
+    LOG_WARNING(Render_OpenGL, "Unimplemented counter reset={}",
+                static_cast<int>(regs.counter_reset));
+}
+
 void Maxwell3D::ProcessSyncPoint() {
    const u32 sync_point = regs.sync_info.sync_point.Value();
    const u32 increment = regs.sync_info.increment.Value();
@@ -647,7 +654,7 @@ void Maxwell3D::DrawArrays() {

    const bool is_indexed{regs.index_array.count && !regs.vertex_buffer.count};
    if (ShouldExecute()) {
-        rasterizer.DrawBatch(is_indexed);
+        rasterizer.Draw(is_indexed, false);
    }

    // TODO(bunnei): Below, we reset vertex count so that we can use these registers to determine if
@@ -661,6 +668,22 @@ void Maxwell3D::DrawArrays() {
    }
 }

+std::optional<u64> Maxwell3D::GetQueryResult() {
+    switch (regs.query.query_get.select) {
+    case Regs::QuerySelect::Zero:
+        return 0;
+    case Regs::QuerySelect::SamplesPassed:
+        // Deferred: the query is handed to the rasterizer and no immediate value is returned.
+        rasterizer.Query(regs.query.QueryAddress(), VideoCore::QueryType::SamplesPassed,
+                         system.GPU().GetTicks());
+        return {};
+    default:
+        UNIMPLEMENTED_MSG("Unimplemented query select type {}",
+                          static_cast<u32>(regs.query.query_get.select.Value()));
+        return 1; // Placeholder result for select types that are not implemented.
+    }
+}
+
 void Maxwell3D::ProcessCBBind(std::size_t stage_index) {
    // Bind the buffer currently in CB_ADDRESS to the specified index in the desired shader stage.
    auto& shader = state.shader_stages[stage_index];
@@ -6,6 +6,7 @@

 #include <array>
 #include <bitset>
+#include <optional>
 #include <type_traits>
 #include <unordered_map>
 #include <vector>
@@ -409,6 +410,27 @@ public:
            Linear = 1,
        };

+        enum class CounterReset : u32 { // Values of the counter_reset register.
+            SampleCnt = 0x01, // The only value Maxwell3D::ProcessCounterReset handles so far.
+            Unk02 = 0x02,
+            Unk03 = 0x03,
+            Unk04 = 0x04,
+            EmittedPrimitives = 0x10, // Not tested
+            Unk11 = 0x11,
+            Unk12 = 0x12,
+            Unk13 = 0x13,
+            Unk15 = 0x15,
+            Unk16 = 0x16,
+            Unk17 = 0x17,
+            Unk18 = 0x18,
+            Unk1A = 0x1A,
+            Unk1B = 0x1B,
+            Unk1C = 0x1C,
+            Unk1D = 0x1D,
+            Unk1E = 0x1E,
+            GeneratedPrimitives = 0x1F,
+        };
+
        struct Cull {
            enum class FrontFace : u32 {
                ClockWise = 0x0900,
@@ -520,7 +542,7 @@ public:
                BitField<12, 1, InvMemoryLayout> type;
            } memory_layout;
            union {
-                BitField<0, 16, u32> array_mode;
+                BitField<0, 16, u32> layers;
                BitField<16, 1, u32> volume;
            };
            u32 layer_stride;
@@ -778,8 +800,12 @@ public:

                u32 zeta_width;
                u32 zeta_height;
+                union {
+                    BitField<0, 16, u32> zeta_layers;
+                    BitField<16, 1, u32> zeta_volume;
+                };

-                INSERT_UNION_PADDING_WORDS(0x27);
+                INSERT_UNION_PADDING_WORDS(0x26);

                u32 depth_test_enable;

@@ -857,7 +883,7 @@ public:
                    BitField<7, 1, u32> c7;
                } clip_distance_enabled;

-                INSERT_UNION_PADDING_WORDS(0x1);
+                u32 samplecnt_enable;

                float point_size;

@@ -865,7 +891,11 @@ public:

                u32 point_sprite_enable;

-                INSERT_UNION_PADDING_WORDS(0x5);
+                INSERT_UNION_PADDING_WORDS(0x3);
+
+                CounterReset counter_reset;
+
+                INSERT_UNION_PADDING_WORDS(0x1);

                u32 zeta_enable;

@@ -1412,12 +1442,15 @@ private:
    /// Handles a write to the QUERY_GET register.
    void ProcessQueryGet();

-    // Writes the query result accordingly
+    /// Writes the query result accordingly.
    void StampQueryResult(u64 payload, bool long_query);

-    // Handles Conditional Rendering
+    /// Handles conditional rendering.
    void ProcessQueryCondition();

+    /// Handles counter resets.
+    void ProcessCounterReset();
+
    /// Handles writes to syncing register.
    void ProcessSyncPoint();

@@ -1434,6 +1467,9 @@ private:

    // Handles a instance drawcall from MME
    void StepInstance(MMEDrawMode expected_mode, u32 count);
+
+    /// Returns a query's value or an empty object if the value will be deferred through a cache.
+    std::optional<u64> GetQueryResult();
 };

 #define ASSERT_REG_POSITION(field_name, position)                                                  \
@@ -1475,6 +1511,7 @@ ASSERT_REG_POSITION(vertex_attrib_format, 0x458);
 ASSERT_REG_POSITION(rt_control, 0x487);
 ASSERT_REG_POSITION(zeta_width, 0x48a);
 ASSERT_REG_POSITION(zeta_height, 0x48b);
+ASSERT_REG_POSITION(zeta_layers, 0x48c);
 ASSERT_REG_POSITION(depth_test_enable, 0x4B3);
 ASSERT_REG_POSITION(independent_blend_enable, 0x4B9);
 ASSERT_REG_POSITION(depth_write_enabled, 0x4BA);
@@ -1499,8 +1536,10 @@ ASSERT_REG_POSITION(screen_y_control, 0x4EB);
 ASSERT_REG_POSITION(vb_element_base, 0x50D);
 ASSERT_REG_POSITION(vb_base_instance, 0x50E);
 ASSERT_REG_POSITION(clip_distance_enabled, 0x544);
+ASSERT_REG_POSITION(samplecnt_enable, 0x545);
 ASSERT_REG_POSITION(point_size, 0x546);
 ASSERT_REG_POSITION(point_sprite_enable, 0x548);
+ASSERT_REG_POSITION(counter_reset, 0x54C);
 ASSERT_REG_POSITION(zeta_enable, 0x54E);
 ASSERT_REG_POSITION(multisample_control, 0x54F);
 ASSERT_REG_POSITION(condition, 0x554);
@@ -24,7 +24,7 @@ MICROPROFILE_DEFINE(GPU_wait, "GPU", "Wait for the GPU", MP_RGB(128, 128, 192));
 GPU::GPU(Core::System& system, VideoCore::RendererBase& renderer, bool is_async)
    : system{system}, renderer{renderer}, is_async{is_async} {
    auto& rasterizer{renderer.Rasterizer()};
-    memory_manager = std::make_unique<Tegra::MemoryManager>(system);
+    memory_manager = std::make_unique<Tegra::MemoryManager>(system, rasterizer);
    dma_pusher = std::make_unique<Tegra::DmaPusher>(*this);
    maxwell_3d = std::make_unique<Engines::Maxwell3D>(system, rasterizer, *memory_manager);
    fermi_2d = std::make_unique<Engines::Fermi2D>(rasterizer);
@@ -140,71 +140,6 @@ void GPU::FlushCommands() {
    renderer.Rasterizer().FlushCommands();
 }

-u32 RenderTargetBytesPerPixel(RenderTargetFormat format) {
-    ASSERT(format != RenderTargetFormat::NONE);
-
-    switch (format) {
-    case RenderTargetFormat::RGBA32_FLOAT:
-    case RenderTargetFormat::RGBA32_UINT:
-        return 16;
-    case RenderTargetFormat::RGBA16_UINT:
-    case RenderTargetFormat::RGBA16_UNORM:
-    case RenderTargetFormat::RGBA16_FLOAT:
-    case RenderTargetFormat::RGBX16_FLOAT:
-    case RenderTargetFormat::RG32_FLOAT:
-    case RenderTargetFormat::RG32_UINT:
-        return 8;
-    case RenderTargetFormat::RGBA8_UNORM:
-    case RenderTargetFormat::RGBA8_SNORM:
-    case RenderTargetFormat::RGBA8_SRGB:
-    case RenderTargetFormat::RGBA8_UINT:
-    case RenderTargetFormat::RGB10_A2_UNORM:
-    case RenderTargetFormat::BGRA8_UNORM:
-    case RenderTargetFormat::BGRA8_SRGB:
-    case RenderTargetFormat::RG16_UNORM:
-    case RenderTargetFormat::RG16_SNORM:
-    case RenderTargetFormat::RG16_UINT:
-    case RenderTargetFormat::RG16_SINT:
-    case RenderTargetFormat::RG16_FLOAT:
-    case RenderTargetFormat::R32_FLOAT:
-    case RenderTargetFormat::R11G11B10_FLOAT:
-    case RenderTargetFormat::R32_UINT:
-        return 4;
-    case RenderTargetFormat::R16_UNORM:
-    case RenderTargetFormat::R16_SNORM:
-    case RenderTargetFormat::R16_UINT:
-    case RenderTargetFormat::R16_SINT:
-    case RenderTargetFormat::R16_FLOAT:
-    case RenderTargetFormat::RG8_UNORM:
-    case RenderTargetFormat::RG8_SNORM:
-        return 2;
-    case RenderTargetFormat::R8_UNORM:
-    case RenderTargetFormat::R8_UINT:
-        return 1;
-    default:
-        UNIMPLEMENTED_MSG("Unimplemented render target format {}", static_cast<u32>(format));
-        return 1;
-    }
-}
-
-u32 DepthFormatBytesPerPixel(DepthFormat format) {
-    switch (format) {
-    case DepthFormat::Z32_S8_X24_FLOAT:
-        return 8;
-    case DepthFormat::Z32_FLOAT:
-    case DepthFormat::S8_Z24_UNORM:
-    case DepthFormat::Z24_X8_UNORM:
-    case DepthFormat::Z24_S8_UNORM:
-    case DepthFormat::Z24_C8_UNORM:
-        return 4;
-    case DepthFormat::Z16_UNORM:
-        return 2;
-    default:
-        UNIMPLEMENTED_MSG("Unimplemented Depth format {}", static_cast<u32>(format));
-        return 1;
-    }
-}
-
 // Note that, traditionally, methods are treated as 4-byte addressable locations, and hence
 // their numbers are written down multiplied by 4 in Docs. Here we are not multiply by 4.
 // So the values you see in docs might be multiplied by 4.
@@ -57,6 +57,7 @@ enum class RenderTargetFormat : u32 {
    RG16_UINT = 0xDD,
    RG16_FLOAT = 0xDE,
    R11G11B10_FLOAT = 0xE0,
+    R32_SINT = 0xE3,
    R32_UINT = 0xE4,
    R32_FLOAT = 0xE5,
    B5G6R5_UNORM = 0xE8,
@@ -82,12 +83,6 @@ enum class DepthFormat : u32 {
    Z32_S8_X24_FLOAT = 0x19,
 };

-/// Returns the number of bytes per pixel of each rendertarget format.
-u32 RenderTargetBytesPerPixel(RenderTargetFormat format);
-
-/// Returns the number of bytes per pixel of each depth format.
-u32 DepthFormatBytesPerPixel(DepthFormat format);
-
 struct CommandListHeader;
 class DebugContext;

@@ -5,7 +5,7 @@
 #include "common/assert.h"
 #include "common/microprofile.h"
 #include "core/core.h"
-#include "core/frontend/scope_acquire_window_context.h"
+#include "core/frontend/scope_acquire_context.h"
 #include "video_core/dma_pusher.h"
 #include "video_core/gpu.h"
 #include "video_core/gpu_thread.h"
@@ -27,7 +27,7 @@ static void RunThread(VideoCore::RendererBase& renderer, Tegra::DmaPusher& dma_p
        return;
    }

-    Core::Frontend::ScopeAcquireWindowContext acquire_context{renderer.GetRenderWindow()};
+    Core::Frontend::ScopeAcquireContext acquire_context{renderer.GetRenderWindow()};

    CommandDataContainer next;
    while (state.is_running) {
@@ -11,10 +11,12 @@
 #include "core/memory.h"
 #include "video_core/gpu.h"
 #include "video_core/memory_manager.h"
+#include "video_core/rasterizer_interface.h"

 namespace Tegra {

-MemoryManager::MemoryManager(Core::System& system) : system{system} {
+MemoryManager::MemoryManager(Core::System& system, VideoCore::RasterizerInterface& rasterizer)
+    : rasterizer{rasterizer}, system{system} {
    std::fill(page_table.pointers.begin(), page_table.pointers.end(), nullptr);
    std::fill(page_table.attributes.begin(), page_table.attributes.end(),
              Common::PageType::Unmapped);
@@ -83,6 +85,7 @@ GPUVAddr MemoryManager::UnmapBuffer(GPUVAddr gpu_addr, u64 size) {
    const auto cpu_addr = GpuToCpuAddress(gpu_addr);
    ASSERT(cpu_addr);

+    // Flush and invalidate through the GPU interface, to be asynchronous if possible.
    system.GPU().FlushAndInvalidateRegion(cache_addr, aligned_size);

    UnmapRange(gpu_addr, aligned_size);
@@ -242,7 +245,9 @@ void MemoryManager::ReadBlock(GPUVAddr src_addr, void* dest_buffer, const std::s
        switch (page_table.attributes[page_index]) {
        case Common::PageType::Memory: {
            const u8* src_ptr{page_table.pointers[page_index] + page_offset};
-            system.GPU().FlushRegion(ToCacheAddr(src_ptr), copy_amount);
+            // Flush must happen on the rasterizer interface, such that memory is always synchronous
+            // when it is read (even when in asynchronous GPU mode). Fixes Dead Cells title menu.
+            rasterizer.FlushRegion(ToCacheAddr(src_ptr), copy_amount);
            std::memcpy(dest_buffer, src_ptr, copy_amount);
            break;
        }
@@ -292,7 +297,9 @@ void MemoryManager::WriteBlock(GPUVAddr dest_addr, const void* src_buffer, const
        switch (page_table.attributes[page_index]) {
        case Common::PageType::Memory: {
            u8* dest_ptr{page_table.pointers[page_index] + page_offset};
-            system.GPU().InvalidateRegion(ToCacheAddr(dest_ptr), copy_amount);
+            // Invalidate must happen on the rasterizer interface, such that memory is always
+            // synchronous when it is written (even when in asynchronous GPU mode).
+            rasterizer.InvalidateRegion(ToCacheAddr(dest_ptr), copy_amount);
            std::memcpy(dest_ptr, src_buffer, copy_amount);
            break;
        }
@@ -339,8 +346,10 @@ void MemoryManager::CopyBlock(GPUVAddr dest_addr, GPUVAddr src_addr, const std::

        switch (page_table.attributes[page_index]) {
        case Common::PageType::Memory: {
+            // Flush must happen on the rasterizer interface, such that memory is always synchronous
+            // when it is copied (even when in asynchronous GPU mode).
            const u8* src_ptr{page_table.pointers[page_index] + page_offset};
-            system.GPU().FlushRegion(ToCacheAddr(src_ptr), copy_amount);
+            rasterizer.FlushRegion(ToCacheAddr(src_ptr), copy_amount);
            WriteBlock(dest_addr, src_ptr, copy_amount);
            break;
        }
@@ -10,6 +10,10 @@
 #include "common/common_types.h"
 #include "common/page_table.h"

+namespace VideoCore {
+class RasterizerInterface;
+}
+
 namespace Core {
 class System;
 }
@@ -47,7 +51,7 @@ struct VirtualMemoryArea {

 class MemoryManager final {
 public:
-    explicit MemoryManager(Core::System& system);
+    explicit MemoryManager(Core::System& system, VideoCore::RasterizerInterface& rasterizer);
    ~MemoryManager();

    GPUVAddr AllocateSpace(u64 size, u64 align);
@@ -172,6 +176,7 @@ private:

    Common::PageTable page_table{page_bits};
    VMAMap vma_map;
+    VideoCore::RasterizerInterface& rasterizer;

    Core::System& system;
 };
@@ -85,6 +85,7 @@ static constexpr ConversionArray morton_to_linear_fns = {
    MortonCopy<true, PixelFormat::RG32UI>,
    MortonCopy<true, PixelFormat::RGBX16F>,
    MortonCopy<true, PixelFormat::R32UI>,
+    MortonCopy<true, PixelFormat::R32I>,
    MortonCopy<true, PixelFormat::ASTC_2D_8X8>,
    MortonCopy<true, PixelFormat::ASTC_2D_8X5>,
    MortonCopy<true, PixelFormat::ASTC_2D_5X4>,
@@ -166,6 +167,7 @@ static constexpr ConversionArray linear_to_morton_fns = {
    MortonCopy<false, PixelFormat::RG32UI>,
    MortonCopy<false, PixelFormat::RGBX16F>,
    MortonCopy<false, PixelFormat::R32UI>,
+    MortonCopy<false, PixelFormat::R32I>,
    nullptr,
    nullptr,
    nullptr,
@@ -0,0 +1,359 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <algorithm>
+#include <array>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <iterator>
+#include <memory>
+#include <mutex>
+#include <optional>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+#include "common/assert.h"
+#include "core/core.h"
+#include "video_core/engines/maxwell_3d.h"
+#include "video_core/gpu.h"
+#include "video_core/memory_manager.h"
+#include "video_core/rasterizer_interface.h"
+
+namespace VideoCommon {
+
+/**
+ * Chain of host counters for a single query type.
+ * `current` is the counter being recorded (null while the stream is disabled) and `last` is the
+ * previously finished counter, which is passed to the next counter as its dependency.
+ */
+template <class QueryCache, class HostCounter>
+class CounterStreamBase {
+public:
+    explicit CounterStreamBase(QueryCache& cache, VideoCore::QueryType type)
+        : cache{cache}, type{type} {}
+
+    /// Enables the stream when `enabled` is true, disables it otherwise.
+    void Update(bool enabled) {
+        if (!enabled) {
+            Disable();
+            return;
+        }
+        Enable();
+    }
+
+    /// Restarts counting from zero by dropping the dependency chain. An enabled stream stays
+    /// enabled: its counter is ended and replaced by a fresh one without dependency.
+    void Reset() {
+        if (IsEnabled()) {
+            current->EndQuery();
+
+            // Begin the replacement right away so the stream does not become disabled.
+            current = cache.Counter(nullptr, type);
+        }
+        last.reset();
+    }
+
+    /// Ends the running counter, starts a new one depending on it and returns the ended counter.
+    /// Returns nullptr when the stream is disabled.
+    std::shared_ptr<HostCounter> Current() {
+        if (!IsEnabled()) {
+            return nullptr;
+        }
+        current->EndQuery();
+        last = std::move(current);
+        current = cache.Counter(last, type);
+        return last;
+    }
+
+    /// Returns true while a counter is being recorded.
+    bool IsEnabled() const {
+        return static_cast<bool>(current);
+    }
+
+private:
+    /// Starts a counter that depends on `last`; does nothing if one is already running.
+    void Enable() {
+        if (!IsEnabled()) {
+            current = cache.Counter(last, type);
+        }
+    }
+
+    /// Ends the running counter (if any) and makes it the new `last`. When no counter was
+    /// running, `last` ends up empty.
+    void Disable() {
+        if (IsEnabled()) {
+            current->EndQuery();
+        }
+        last = std::move(current);
+        current.reset();
+    }
+
+    QueryCache& cache;               ///< Cache used to create new counters.
+    const VideoCore::QueryType type; ///< Query type served by this stream.
+
+    std::shared_ptr<HostCounter> current; ///< Counter being recorded, null when disabled.
+    std::shared_ptr<HostCounter> last;    ///< Previously finished counter.
+};
+
+/**
+ * Backend-independent cache of guest GPU queries.
+ *
+ * The derived cache passes itself as `QueryCache`; this base casts `*this` to that type whenever
+ * it builds counters, streams or cached queries. Cached queries are stored in buckets keyed by
+ * the page (host address >> PAGE_SHIFT) of their host pointer.
+ *
+ * @tparam QueryCache    Derived cache type.
+ * @tparam CachedQuery   Type stored per registered guest query.
+ * @tparam CounterStream Per query type stream of host counters.
+ * @tparam HostCounter   Backend counter type created by Counter().
+ * @tparam QueryPool     Element type of `query_pools`, managed by the derived cache.
+ */
+template <class QueryCache, class CachedQuery, class CounterStream, class HostCounter,
+          class QueryPool>
+class QueryCacheBase {
+public:
+    explicit QueryCacheBase(Core::System& system, VideoCore::RasterizerInterface& rasterizer)
+        : system{system}, rasterizer{rasterizer}, streams{{CounterStream{
+                                                      static_cast<QueryCache&>(*this),
+                                                      VideoCore::QueryType::SamplesPassed}}} {}
+
+    /// Flushes and removes every cached query overlapping [addr, addr + size).
+    void InvalidateRegion(CacheAddr addr, std::size_t size) {
+        std::unique_lock lock{mutex};
+        FlushAndRemoveRegion(addr, size);
+    }
+
+    /// Flushes and removes every cached query overlapping [addr, addr + size).
+    void FlushRegion(CacheAddr addr, std::size_t size) {
+        std::unique_lock lock{mutex};
+        FlushAndRemoveRegion(addr, size);
+    }
+
+    /**
+     * Records a query in GPU mapped memory, potentially marked with a timestamp.
+     * @param gpu_addr  GPU address to flush to when the mapped memory is read.
+     * @param type      Query type, e.g. SamplesPassed.
+     * @param timestamp Timestamp, when empty the flushed query is assumed to be short.
+     */
+    void Query(GPUVAddr gpu_addr, VideoCore::QueryType type, std::optional<u64> timestamp) {
+        std::unique_lock lock{mutex};
+        auto& memory_manager = system.GPU().MemoryManager();
+        const auto host_ptr = memory_manager.GetPointer(gpu_addr);
+
+        // Reuse the query already cached at this address, otherwise register a new one.
+        CachedQuery* query = TryGet(ToCacheAddr(host_ptr));
+        if (!query) {
+            const auto cpu_addr = memory_manager.GpuToCpuAddress(gpu_addr);
+            ASSERT_OR_EXECUTE(cpu_addr, return;);
+
+            query = Register(type, *cpu_addr, host_ptr, timestamp.has_value());
+        }
+
+        // Current() slices the stream, so the bound counter is one that has already been ended.
+        query->BindCounter(Stream(type).Current(), timestamp);
+    }
+
+    /// Updates counters from GPU state. Expected to be called once per draw, clear or dispatch.
+    void UpdateCounters() {
+        std::unique_lock lock{mutex};
+        const auto& regs = system.GPU().Maxwell3D().regs;
+        Stream(VideoCore::QueryType::SamplesPassed).Update(regs.samplecnt_enable);
+    }
+
+    /// Resets a counter to zero. It doesn't disable the query after resetting.
+    void ResetCounter(VideoCore::QueryType type) {
+        std::unique_lock lock{mutex};
+        Stream(type).Reset();
+    }
+
+    /// Disable all active streams. Expected to be called at the end of a command buffer.
+    void DisableStreams() {
+        std::unique_lock lock{mutex};
+        for (auto& stream : streams) {
+            stream.Update(false);
+        }
+    }
+
+    /// Returns a new host counter that depends on `dependency` (which may be null).
+    std::shared_ptr<HostCounter> Counter(std::shared_ptr<HostCounter> dependency,
+                                         VideoCore::QueryType type) {
+        return std::make_shared<HostCounter>(static_cast<QueryCache&>(*this), std::move(dependency),
+                                             type);
+    }
+
+    /// Returns the counter stream of the specified type.
+    CounterStream& Stream(VideoCore::QueryType type) {
+        return streams[static_cast<std::size_t>(type)];
+    }
+
+    /// Returns the counter stream of the specified type.
+    const CounterStream& Stream(VideoCore::QueryType type) const {
+        return streams[static_cast<std::size_t>(type)];
+    }
+
+protected:
+    /// One pool per query type, indexed by the query type's value; owned by the derived cache.
+    std::array<QueryPool, VideoCore::NumQueryTypes> query_pools;
+
+private:
+    /// Flushes a memory range to guest memory and removes it from the cache.
+    void FlushAndRemoveRegion(CacheAddr addr, std::size_t size) {
+        const u64 addr_begin = static_cast<u64>(addr);
+        const u64 addr_end = addr_begin + static_cast<u64>(size);
+        // True when the query's byte range intersects [addr_begin, addr_end).
+        const auto in_range = [addr_begin, addr_end](CachedQuery& query) {
+            const u64 cache_begin = query.GetCacheAddr();
+            const u64 cache_end = cache_begin + query.SizeInBytes();
+            return cache_begin < addr_end && addr_begin < cache_end;
+        };
+
+        const u64 page_end = addr_end >> PAGE_SHIFT;
+        for (u64 page = addr_begin >> PAGE_SHIFT; page <= page_end; ++page) {
+            const auto it = cached_queries.find(page);
+            if (it == std::end(cached_queries)) {
+                continue;
+            }
+            auto& contents = it->second;
+            // First write back every overlapping query, then drop them from the bucket.
+            for (auto& query : contents) {
+                if (!in_range(query)) {
+                    continue;
+                }
+                rasterizer.UpdatePagesCachedCount(query.CpuAddr(), query.SizeInBytes(), -1);
+                query.Flush();
+            }
+            contents.erase(std::remove_if(std::begin(contents), std::end(contents), in_range),
+                           std::end(contents));
+        }
+    }
+
+    /// Registers the passed parameters as cached and returns a pointer to the stored cached query.
+    /// The pointer refers to a vector element, so it is only valid until the bucket is modified.
+    CachedQuery* Register(VideoCore::QueryType type, VAddr cpu_addr, u8* host_ptr, bool timestamp) {
+        rasterizer.UpdatePagesCachedCount(cpu_addr, CachedQuery::SizeInBytes(timestamp), 1);
+        const u64 page = static_cast<u64>(ToCacheAddr(host_ptr)) >> PAGE_SHIFT;
+        return &cached_queries[page].emplace_back(static_cast<QueryCache&>(*this), type, cpu_addr,
+                                                  host_ptr);
+    }
+
+    /// Tries to get the cached query that starts exactly at `addr`. Returns nullptr on failure.
+    CachedQuery* TryGet(CacheAddr addr) {
+        const u64 page = static_cast<u64>(addr) >> PAGE_SHIFT;
+        const auto it = cached_queries.find(page);
+        if (it == std::end(cached_queries)) {
+            return nullptr;
+        }
+        auto& contents = it->second;
+        const auto found =
+            std::find_if(std::begin(contents), std::end(contents),
+                         [addr](auto& query) { return query.GetCacheAddr() == addr; });
+        return found != std::end(contents) ? &*found : nullptr;
+    }
+
+    /// log2 of the bucket granularity: queries are grouped by 4096-byte page of host address.
+    static constexpr unsigned PAGE_SHIFT = 12;
+
+    Core::System& system;
+    VideoCore::RasterizerInterface& rasterizer;
+
+    /// Guards the public entry points; recursive because they may be re-entered while held.
+    std::recursive_mutex mutex;
+
+    /// Cached queries bucketed by page index of their host pointer.
+    std::unordered_map<u64, std::vector<CachedQuery>> cached_queries;
+
+    /// One stream per query type, indexed by the query type's value.
+    std::array<CounterStream, VideoCore::NumQueryTypes> streams;
+};
+
+/**
+ * Backend-independent part of a host counter.
+ * A counter's value is its own backend result plus the value of the counter it depends on, so a
+ * chain of counters accumulates the total since the chain started.
+ */
+template <class QueryCache, class HostCounter>
+class HostCounterBase {
+public:
+    /// @param dependency_ Counter whose value is added to this one, or null for none.
+    explicit HostCounterBase(std::shared_ptr<HostCounter> dependency_)
+        : dependency{std::move(dependency_)}, depth{dependency ? (dependency->Depth() + 1) : 0} {
+        // Avoid nesting too many dependencies to avoid a stack overflow when these are deleted.
+        // Past the threshold the chain is collapsed: the dependency is resolved now and its value
+        // is kept in base_result instead of keeping the counter alive.
+        constexpr u64 depth_threshold = 96;
+        if (depth > depth_threshold) {
+            depth = 0;
+            base_result = dependency->Query();
+            dependency = nullptr;
+        }
+    }
+    virtual ~HostCounterBase() = default;
+
+    /// Returns the current value of the query, resolving it (and its dependency) on first use and
+    /// returning the stored value afterwards.
+    u64 Query() {
+        if (result) {
+            return *result;
+        }
+
+        u64 value = BlockingQuery() + base_result;
+        if (dependency) {
+            value += dependency->Query();
+            // The dependency's value is now folded into the result, release the chain.
+            dependency = nullptr;
+        }
+
+        result = value;
+        return *result;
+    }
+
+    /// Returns true when flushing this query will potentially wait, i.e. while its value has not
+    /// been resolved yet. Once Query() has stored a result no further wait can happen.
+    bool WaitPending() const noexcept {
+        return !result.has_value();
+    }
+
+    /// Returns the number of nested dependencies behind this counter.
+    u64 Depth() const noexcept {
+        return depth;
+    }
+
+protected:
+    /// Returns the value of query from the backend API blocking as needed.
+    virtual u64 BlockingQuery() const = 0;
+
+private:
+    std::shared_ptr<HostCounter> dependency; ///< Counter to add to this value.
+    std::optional<u64> result;               ///< Filled with the already returned value.
+    u64 depth;                               ///< Number of nested dependencies.
+    u64 base_result = 0;                     ///< Equivalent to nested dependencies value.
+};
+
+/**
+ * Backend-independent part of a cached guest query: where the result lives in guest memory, the
+ * host counter that will provide its value and an optional timestamp written next to it.
+ */
+template <class HostCounter>
+class CachedQueryBase {
+public:
+    explicit CachedQueryBase(VAddr cpu_addr, u8* host_ptr)
+        : cpu_addr{cpu_addr}, host_ptr{host_ptr} {}
+    virtual ~CachedQueryBase() = default;
+
+    CachedQueryBase(CachedQueryBase&&) noexcept = default;
+    CachedQueryBase(const CachedQueryBase&) = delete;
+
+    CachedQueryBase& operator=(CachedQueryBase&&) noexcept = default;
+    CachedQueryBase& operator=(const CachedQueryBase&) = delete;
+
+    /// Writes the query value, followed by the timestamp when there is one, to guest memory.
+    virtual void Flush() {
+        // A query without a counter has just been reset; zero is written in that case.
+        u64 value = 0;
+        if (counter) {
+            value = counter->Query();
+        }
+        std::memcpy(host_ptr, &value, sizeof(value));
+
+        if (!timestamp) {
+            return;
+        }
+        const u64 stamp = *timestamp;
+        std::memcpy(host_ptr + TIMESTAMP_OFFSET, &stamp, sizeof(stamp));
+    }
+
+    /// Binds a counter (and optional timestamp) to this query.
+    void BindCounter(std::shared_ptr<HostCounter> counter_, std::optional<u64> timestamp_) {
+        // An existing counter means the game is rewriting this query; write the old value out
+        // first so it is not lost.
+        if (counter) {
+            Flush();
+        }
+        timestamp = timestamp_;
+        counter = std::move(counter_);
+    }
+
+    /// Returns the guest CPU address of the query.
+    VAddr CpuAddr() const noexcept {
+        return cpu_addr;
+    }
+
+    /// Returns the cache address derived from the host pointer.
+    CacheAddr GetCacheAddr() const noexcept {
+        return ToCacheAddr(host_ptr);
+    }
+
+    /// Returns the size in bytes this query occupies, depending on whether it has a timestamp.
+    u64 SizeInBytes() const noexcept {
+        const bool with_timestamp = timestamp.has_value();
+        return SizeInBytes(with_timestamp);
+    }
+
+    /// Returns the size in bytes of a query with or without a timestamp.
+    static constexpr u64 SizeInBytes(bool with_timestamp) noexcept {
+        if (with_timestamp) {
+            return LARGE_QUERY_SIZE;
+        }
+        return SMALL_QUERY_SIZE;
+    }
+
+protected:
+    /// Returns true when querying the counter may potentially block.
+    bool WaitPending() const noexcept {
+        if (!counter) {
+            return false;
+        }
+        return counter->WaitPending();
+    }
+
+private:
+    static constexpr std::size_t SMALL_QUERY_SIZE = 8;   // Query size without timestamp.
+    static constexpr std::size_t LARGE_QUERY_SIZE = 16;  // Query size with timestamp.
+    static constexpr std::intptr_t TIMESTAMP_OFFSET = 8; // Timestamp offset in a large query.
+
+    VAddr cpu_addr;                       ///< Guest CPU address.
+    u8* host_ptr;                         ///< Writable host pointer.
+    std::shared_ptr<HostCounter> counter; ///< Host counter to query, owns the dependency tree.
+    std::optional<u64> timestamp;         ///< Timestamp to flush to guest memory.
+};
+
+} // namespace VideoCommon
@@ -6,6 +6,7 @@

 #include <atomic>
 #include <functional>
+#include <optional>
 #include "common/common_types.h"
 #include "video_core/engines/fermi_2d.h"
 #include "video_core/gpu.h"
@@ -17,6 +18,11 @@ class MemoryManager;

 namespace VideoCore {

+/// Kinds of GPU queries the rasterizer can record. The enumerator values are used as array
+/// indices, so they must stay contiguous and start at zero.
+enum class QueryType {
+    SamplesPassed,
+};
+/// Number of enumerators in QueryType; keep in sync when adding a query type.
+constexpr std::size_t NumQueryTypes = 1;
+
 enum class LoadCallbackStage {
    Prepare,
    Decompile,
@@ -29,11 +35,8 @@ class RasterizerInterface {
 public:
    virtual ~RasterizerInterface() {}

-    /// Draw the current batch of vertex arrays
-    virtual bool DrawBatch(bool is_indexed) = 0;
-
-    /// Draw the current batch of multiple instances of vertex arrays
-    virtual bool DrawMultiBatch(bool is_indexed) = 0;
+    /// Dispatches a draw invocation
+    virtual void Draw(bool is_indexed, bool is_instanced) = 0;

    /// Clear the current framebuffer
    virtual void Clear() = 0;
@@ -41,6 +44,12 @@ public:
    /// Dispatches a compute shader invocation
    virtual void DispatchCompute(GPUVAddr code_addr) = 0;

+    /// Resets the counter of a query
+    virtual void ResetCounter(QueryType type) = 0;
+
+    /// Records a GPU query and caches it
+    virtual void Query(GPUVAddr gpu_addr, QueryType type, std::optional<u64> timestamp) = 0;
+
    /// Notify rasterizer that all caches should be flushed to Switch memory
    virtual void FlushAll() = 0;

@@ -35,15 +35,19 @@ public:
    explicit RendererBase(Core::Frontend::EmuWindow& window);
    virtual ~RendererBase();

-    /// Swap buffers (render frame)
-    virtual void SwapBuffers(const Tegra::FramebufferConfig* framebuffer) = 0;
-
    /// Initialize the renderer
    virtual bool Init() = 0;

    /// Shutdown the renderer
    virtual void ShutDown() = 0;

+    /// Finalize rendering the guest frame and draw into the presentation texture
+    virtual void SwapBuffers(const Tegra::FramebufferConfig* framebuffer) = 0;
+
+    /// Draws the latest frame to the window waiting timeout_ms for a frame to arrive (Renderer
+    /// specific implementation)
+    virtual void TryPresent(int timeout_ms) = 0;
+
    // Getter/setter functions:
    // ------------------------

@@ -0,0 +1,120 @@
+// Copyright 2019 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include <algorithm>
+#include <cstring>
+#include <memory>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+#include <glad/glad.h>
+
+#include "common/assert.h"
+#include "core/core.h"
+#include "video_core/engines/maxwell_3d.h"
+#include "video_core/memory_manager.h"
+#include "video_core/renderer_opengl/gl_query_cache.h"
+#include "video_core/renderer_opengl/gl_rasterizer.h"
+
+namespace OpenGL {
+
+namespace {
+
+/// OpenGL query target of every VideoCore::QueryType, indexed by the enumerator's value.
+constexpr std::array<GLenum, VideoCore::NumQueryTypes> QueryTargets = {GL_SAMPLES_PASSED};
+
+/// Translates a query type into its OpenGL query target.
+constexpr GLenum GetTarget(VideoCore::QueryType type) {
+    const auto index = static_cast<std::size_t>(type);
+    return QueryTargets[index];
+}
+
+} // Anonymous namespace
+
+// Hands the system and the rasterizer (viewed through its generic interface) to the shared cache
+// base, and keeps the concrete OpenGL rasterizer for AnyCommandQueued().
+QueryCache::QueryCache(Core::System& system, RasterizerOpenGL& gl_rasterizer)
+    : VideoCommon::QueryCacheBase<
+          QueryCache, CachedQuery, CounterStream, HostCounter,
+          std::vector<OGLQuery>>{system,
+                                 static_cast<VideoCore::RasterizerInterface&>(gl_rasterizer)},
+      gl_rasterizer{gl_rasterizer} {}
+
+// Defined here rather than in the header.
+QueryCache::~QueryCache() = default;
+
+/// Returns a query object for `type`: a recycled one from the pool when available, otherwise a
+/// newly created one.
+OGLQuery QueryCache::AllocateQuery(VideoCore::QueryType type) {
+    auto& pool = query_pools[static_cast<std::size_t>(type)];
+    if (!pool.empty()) {
+        OGLQuery recycled = std::move(pool.back());
+        pool.pop_back();
+        return recycled;
+    }
+
+    OGLQuery fresh;
+    fresh.Create(GetTarget(type));
+    return fresh;
+}
+
+/// Returns a query object to the pool of its type so AllocateQuery can hand it out again.
+void QueryCache::Reserve(VideoCore::QueryType type, OGLQuery&& query) {
+    auto& pool = query_pools[static_cast<std::size_t>(type)];
+    pool.emplace_back(std::move(query));
+}
+
+/// Forwards to the OpenGL rasterizer: reports whether it has any command queued.
+bool QueryCache::AnyCommandQueued() const noexcept {
+    const bool queued = gl_rasterizer.AnyCommandQueued();
+    return queued;
+}
+
+/// Takes a query object from the cache and immediately begins recording into it.
+HostCounter::HostCounter(QueryCache& cache, std::shared_ptr<HostCounter> dependency,
+                         VideoCore::QueryType type)
+    : VideoCommon::HostCounterBase<QueryCache, HostCounter>{std::move(dependency)}, cache{cache},
+      type{type}, query{cache.AllocateQuery(type)} {
+    const GLenum target = GetTarget(type);
+    glBeginQuery(target, query.handle);
+}
+
+/// Gives the query object back to the cache's pool instead of destroying it.
+HostCounter::~HostCounter() {
+    OGLQuery& released = query;
+    cache.Reserve(type, std::move(released));
+}
+
+/// Ends the OpenGL query started by the constructor.
+void HostCounter::EndQuery() {
+    // A query can end up being waited on without any command (glDraw, glClear, glDispatch)
+    // having been issued, and that wait would lock. glFlush counts as a command, so one is
+    // inserted into the command stream when nothing else has been queued.
+    const bool has_queued_commands = cache.AnyCommandQueued();
+    if (!has_queued_commands) {
+        glFlush();
+    }
+
+    const GLenum target = GetTarget(type);
+    glEndQuery(target);
+}
+
+/// Reads the result of this counter's query with GL_QUERY_RESULT.
+u64 HostCounter::BlockingQuery() const {
+    // Zero-initialized on purpose: when the call raises a GL error the output parameter is left
+    // untouched, and returning an indeterminate value would be undefined behavior.
+    GLint64 value = 0;
+    glGetQueryObjecti64v(query.handle, GL_QUERY_RESULT, &value);
+    return static_cast<u64>(value);
+}
+
+// Stores the owning cache as a pointer (not a reference) so the type stays move-assignable.
+CachedQuery::CachedQuery(QueryCache& cache, VideoCore::QueryType type, VAddr cpu_addr, u8* host_ptr)
+    : VideoCommon::CachedQueryBase<HostCounter>{cpu_addr, host_ptr}, cache{&cache}, type{type} {}
+
+// Moving the base only moves the base-class state, so rhs.cache and rhs.type are still intact when
+// they are copied afterwards.
+CachedQuery::CachedQuery(CachedQuery&& rhs) noexcept
+    : VideoCommon::CachedQueryBase<HostCounter>(std::move(rhs)), cache{rhs.cache}, type{rhs.type} {}
+
+/// Move-assigns the base-class state and copies the cache pointer and query type.
+CachedQuery& CachedQuery::operator=(CachedQuery&& rhs) noexcept {
+    using Base = VideoCommon::CachedQueryBase<HostCounter>;
+
+    // The base assignment only touches base-class members, so rhs.type and rhs.cache remain
+    // readable afterwards.
+    Base::operator=(std::move(rhs));
+    type = rhs.type;
+    cache = rhs.cache;
+    return *this;
+}
+
+/// Flushes the query to guest memory, pausing the counter stream around the flush when needed.
+void CachedQuery::Flush() {
+    using Base = VideoCommon::CachedQueryBase<HostCounter>;
+
+    // Waiting for a query while another query of the same target is enabled locks Nvidia's
+    // driver. The stream is therefore disabled and re-enabled around the flush, which keeps its
+    // dependency chain. This is only needed when a wait is pending and the stream is enabled.
+    auto& stream = cache->Stream(type);
+    const bool must_pause_stream = WaitPending() && stream.IsEnabled();
+    if (!must_pause_stream) {
+        Base::Flush();
+        return;
+    }
+
+    stream.Update(false);
+    Base::Flush();
+    stream.Update(true);
+}
+
+} // namespace OpenGL
@@ -0,0 +1,78 @@
+// Copyright 2019 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <array>
+#include <memory>
+#include <vector>
+
+#include "common/common_types.h"
+#include "video_core/query_cache.h"
+#include "video_core/rasterizer_interface.h"
+#include "video_core/renderer_opengl/gl_resource_manager.h"
+
+namespace Core {
+class System;
+}
+
+namespace OpenGL {
+
+class CachedQuery;
+class HostCounter;
+class QueryCache;
+class RasterizerOpenGL;
+
+using CounterStream = VideoCommon::CounterStreamBase<QueryCache, HostCounter>;
+
+/// OpenGL query cache. Supplies the shared cache base with OpenGL query objects, pooled per query
+/// type in a std::vector<OGLQuery>.
+class QueryCache final : public VideoCommon::QueryCacheBase<QueryCache, CachedQuery, CounterStream,
+                                                            HostCounter, std::vector<OGLQuery>> {
+public:
+    explicit QueryCache(Core::System& system, RasterizerOpenGL& rasterizer);
+    ~QueryCache();
+
+    /// Returns a query object for `type`, reusing a pooled one when available.
+    OGLQuery AllocateQuery(VideoCore::QueryType type);
+
+    /// Puts a query object back into the pool of `type` for later reuse.
+    void Reserve(VideoCore::QueryType type, OGLQuery&& query);
+
+    /// Returns whether the OpenGL rasterizer reports any queued command.
+    bool AnyCommandQueued() const noexcept;
+
+private:
+    RasterizerOpenGL& gl_rasterizer; ///< Concrete rasterizer, asked by AnyCommandQueued().
+};
+
+/// OpenGL host counter. Begins a GL query on construction and hands its query object back to the
+/// cache on destruction.
+class HostCounter final : public VideoCommon::HostCounterBase<QueryCache, HostCounter> {
+public:
+    explicit HostCounter(QueryCache& cache, std::shared_ptr<HostCounter> dependency,
+                         VideoCore::QueryType type);
+    ~HostCounter();
+
+    /// Ends the GL query begun by the constructor.
+    void EndQuery();
+
+private:
+    /// Reads the query result from OpenGL.
+    u64 BlockingQuery() const override;
+
+    QueryCache& cache;               ///< Cache the query object was taken from.
+    const VideoCore::QueryType type; ///< Query type, selects the GL query target.
+    OGLQuery query;                  ///< GL query object recorded into.
+};
+
+/// OpenGL cached query. Movable but not copyable; adds stream handling around the base flush.
+class CachedQuery final : public VideoCommon::CachedQueryBase<HostCounter> {
+public:
+    explicit CachedQuery(QueryCache& cache, VideoCore::QueryType type, VAddr cpu_addr,
+                         u8* host_ptr);
+    CachedQuery(CachedQuery&& rhs) noexcept;
+    CachedQuery(const CachedQuery&) = delete;
+
+    CachedQuery& operator=(CachedQuery&& rhs) noexcept;
+    CachedQuery& operator=(const CachedQuery&) = delete;
+
+    /// Flushes to guest memory, temporarily disabling the counter stream when a wait is pending.
+    void Flush() override;
+
+private:
+    QueryCache* cache;         ///< Owning cache; a pointer so the class can be move-assigned.
+    VideoCore::QueryType type; ///< Query type, selects the counter stream.
+};
+
+} // namespace OpenGL
@@ -25,6 +25,7 @@
 #include "video_core/engines/maxwell_3d.h"
 #include "video_core/engines/shader_type.h"
 #include "video_core/memory_manager.h"
+#include "video_core/renderer_opengl/gl_query_cache.h"
 #include "video_core/renderer_opengl/gl_rasterizer.h"
 #include "video_core/renderer_opengl/gl_shader_cache.h"
 #include "video_core/renderer_opengl/gl_shader_gen.h"
@@ -35,6 +36,7 @@ namespace OpenGL {

 using Maxwell = Tegra::Engines::Maxwell3D::Regs;

+using Tegra::Engines::ShaderType;
 using VideoCore::Surface::PixelFormat;
 using VideoCore::Surface::SurfaceTarget;
 using VideoCore::Surface::SurfaceType;
@@ -55,8 +57,7 @@ namespace {

 template <typename Engine, typename Entry>
 Tegra::Texture::FullTextureInfo GetTextureInfo(const Engine& engine, const Entry& entry,
-                                               Tegra::Engines::ShaderType shader_type,
-                                               std::size_t index = 0) {
+                                               ShaderType shader_type, std::size_t index = 0) {
    if (entry.IsBindless()) {
        const Tegra::Texture::TextureHandle tex_handle =
            engine.AccessConstBuffer32(shader_type, entry.GetBuffer(), entry.GetOffset());
@@ -92,8 +93,8 @@ std::size_t GetConstBufferSize(const Tegra::Engines::ConstBufferInfo& buffer,
 RasterizerOpenGL::RasterizerOpenGL(Core::System& system, Core::Frontend::EmuWindow& emu_window,
                                   ScreenInfo& info)
    : RasterizerAccelerated{system.Memory()}, texture_cache{system, *this, device},
-      shader_cache{*this, system, emu_window, device}, system{system}, screen_info{info},
-      buffer_cache{*this, system, device, STREAM_BUFFER_SIZE} {
+      shader_cache{*this, system, emu_window, device}, query_cache{system, *this}, system{system},
+      screen_info{info}, buffer_cache{*this, system, device, STREAM_BUFFER_SIZE} {
    shader_program_manager = std::make_unique<GLShader::ProgramManager>();
    state.draw.shader_program = 0;
    state.Apply();
@@ -541,11 +542,16 @@ void RasterizerOpenGL::Clear() {
    } else if (use_stencil) {
        glClearBufferiv(GL_STENCIL, 0, &regs.clear_stencil);
    }
+
+    ++num_queued_commands;
 }

 void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) {
    MICROPROFILE_SCOPE(OpenGL_Drawing);
    auto& gpu = system.GPU().Maxwell3D();
+    const auto& regs = gpu.regs;
+
+    query_cache.UpdateCounters();

    SyncRasterizeEnable(state);
    SyncColorMask();
@@ -611,7 +617,7 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) {

    // Setup shaders and their used resources.
    texture_cache.GuardSamplers(true);
-    const auto primitive_mode = MaxwellToGL::PrimitiveTopology(gpu.regs.draw.topology);
+    const GLenum primitive_mode = MaxwellToGL::PrimitiveTopology(gpu.regs.draw.topology);
    SetupShaders(primitive_mode);
    texture_cache.GuardSamplers(false);

@@ -638,35 +644,47 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) {
        glTextureBarrier();
    }

+    ++num_queued_commands;
+
    const GLuint base_instance = static_cast<GLuint>(gpu.regs.vb_base_instance);
    const GLsizei num_instances =
        static_cast<GLsizei>(is_instanced ? gpu.mme_draw.instance_count : 1);
    if (is_indexed) {
-        const GLenum index_format = MaxwellToGL::IndexFormat(gpu.regs.index_array.format);
        const GLint base_vertex = static_cast<GLint>(gpu.regs.vb_element_base);
        const GLsizei num_vertices = static_cast<GLsizei>(gpu.regs.index_array.count);
-        glDrawElementsInstancedBaseVertexBaseInstance(
-            primitive_mode, num_vertices, index_format,
-            reinterpret_cast<const void*>(index_buffer_offset), num_instances, base_vertex,
-            base_instance);
+        const GLvoid* offset = reinterpret_cast<const GLvoid*>(index_buffer_offset);
+        const GLenum format = MaxwellToGL::IndexFormat(gpu.regs.index_array.format);
+        if (num_instances == 1 && base_instance == 0 && base_vertex == 0) {
+            glDrawElements(primitive_mode, num_vertices, format, offset);
+        } else if (num_instances == 1 && base_instance == 0) {
+            glDrawElementsBaseVertex(primitive_mode, num_vertices, format, offset, base_vertex);
+        } else if (base_vertex == 0 && base_instance == 0) {
+            glDrawElementsInstanced(primitive_mode, num_vertices, format, offset, num_instances);
+        } else if (base_vertex == 0) {
+            glDrawElementsInstancedBaseInstance(primitive_mode, num_vertices, format, offset,
+                                                num_instances, base_instance);
+        } else if (base_instance == 0) {
+            glDrawElementsInstancedBaseVertex(primitive_mode, num_vertices, format, offset,
+                                              num_instances, base_vertex);
+        } else {
+            glDrawElementsInstancedBaseVertexBaseInstance(primitive_mode, num_vertices, format,
+                                                          offset, num_instances, base_vertex,
+                                                          base_instance);
+        }
    } else {
        const GLint base_vertex = static_cast<GLint>(gpu.regs.vertex_buffer.first);
        const GLsizei num_vertices = static_cast<GLsizei>(gpu.regs.vertex_buffer.count);
-        glDrawArraysInstancedBaseInstance(primitive_mode, base_vertex, num_vertices, num_instances,
-                                          base_instance);
+        if (num_instances == 1 && base_instance == 0) {
+            glDrawArrays(primitive_mode, base_vertex, num_vertices);
+        } else if (base_instance == 0) {
+            glDrawArraysInstanced(primitive_mode, base_vertex, num_vertices, num_instances);
+        } else {
+            glDrawArraysInstancedBaseInstance(primitive_mode, base_vertex, num_vertices,
+                                              num_instances, base_instance);
+        }
    }
 }

-bool RasterizerOpenGL::DrawBatch(bool is_indexed) {
-    Draw(is_indexed, false);
-    return true;
-}
-
-bool RasterizerOpenGL::DrawMultiBatch(bool is_indexed) {
-    Draw(is_indexed, true);
-    return true;
-}
-
 void RasterizerOpenGL::DispatchCompute(GPUVAddr code_addr) {
    if (device.HasBrokenCompute()) {
        return;
@@ -707,6 +725,16 @@ void RasterizerOpenGL::DispatchCompute(GPUVAddr code_addr) {
    state.ApplyProgramPipeline();

    glDispatchCompute(launch_desc.grid_dim_x, launch_desc.grid_dim_y, launch_desc.grid_dim_z);
+    ++num_queued_commands;
+}
+
+void RasterizerOpenGL::ResetCounter(VideoCore::QueryType type) {
+    query_cache.ResetCounter(type);
+}
+
+void RasterizerOpenGL::Query(GPUVAddr gpu_addr, VideoCore::QueryType type,
+                             std::optional<u64> timestamp) {
+    query_cache.Query(gpu_addr, type, timestamp);
 }

 void RasterizerOpenGL::FlushAll() {}
@@ -718,6 +746,7 @@ void RasterizerOpenGL::FlushRegion(CacheAddr addr, u64 size) {
    }
    texture_cache.FlushRegion(addr, size);
    buffer_cache.FlushRegion(addr, size);
+    query_cache.FlushRegion(addr, size);
 }

 void RasterizerOpenGL::InvalidateRegion(CacheAddr addr, u64 size) {
@@ -728,6 +757,7 @@ void RasterizerOpenGL::InvalidateRegion(CacheAddr addr, u64 size) {
    texture_cache.InvalidateRegion(addr, size);
    shader_cache.InvalidateRegion(addr, size);
    buffer_cache.InvalidateRegion(addr, size);
+    query_cache.InvalidateRegion(addr, size);
 }

 void RasterizerOpenGL::FlushAndInvalidateRegion(CacheAddr addr, u64 size) {
@@ -738,10 +768,18 @@ void RasterizerOpenGL::FlushAndInvalidateRegion(CacheAddr addr, u64 size) {
 }

 void RasterizerOpenGL::FlushCommands() {
+    // Only flush when we have commands queued to OpenGL.
+    if (num_queued_commands == 0) {
+        return;
+    }
+    num_queued_commands = 0;
    glFlush();
 }

 void RasterizerOpenGL::TickFrame() {
+    // Ticking a frame means that buffers will be swapped, calling glFlush implicitly.
+    num_queued_commands = 0;
+
    buffer_cache.TickFrame();
 }

@@ -872,15 +910,10 @@ void RasterizerOpenGL::SetupDrawTextures(std::size_t stage_index, const Shader&
    const auto& maxwell3d = system.GPU().Maxwell3D();
    u32 binding = device.GetBaseBindings(stage_index).sampler;
    for (const auto& entry : shader->GetShaderEntries().samplers) {
-        const auto shader_type = static_cast<Tegra::Engines::ShaderType>(stage_index);
-        if (!entry.IsIndexed()) {
-            const auto texture = GetTextureInfo(maxwell3d, entry, shader_type);
+        const auto shader_type = static_cast<ShaderType>(stage_index);
+        for (std::size_t i = 0; i < entry.Size(); ++i) {
+            const auto texture = GetTextureInfo(maxwell3d, entry, shader_type, i);
            SetupTexture(binding++, texture, entry);
-        } else {
-            for (std::size_t i = 0; i < entry.Size(); ++i) {
-                const auto texture = GetTextureInfo(maxwell3d, entry, shader_type, i);
-                SetupTexture(binding++, texture, entry);
-            }
        }
    }
 }
@@ -890,16 +923,9 @@ void RasterizerOpenGL::SetupComputeTextures(const Shader& kernel) {
    const auto& compute = system.GPU().KeplerCompute();
    u32 binding = 0;
    for (const auto& entry : kernel->GetShaderEntries().samplers) {
-        if (!entry.IsIndexed()) {
-            const auto texture =
-                GetTextureInfo(compute, entry, Tegra::Engines::ShaderType::Compute);
+        for (std::size_t i = 0; i < entry.Size(); ++i) {
+            const auto texture = GetTextureInfo(compute, entry, ShaderType::Compute, i);
            SetupTexture(binding++, texture, entry);
-        } else {
-            for (std::size_t i = 0; i < entry.Size(); ++i) {
-                const auto texture =
-                    GetTextureInfo(compute, entry, Tegra::Engines::ShaderType::Compute, i);
-                SetupTexture(binding++, texture, entry);
-            }
        }
    }
 }
@@ -24,6 +24,7 @@
 #include "video_core/renderer_opengl/gl_buffer_cache.h"
 #include "video_core/renderer_opengl/gl_device.h"
 #include "video_core/renderer_opengl/gl_framebuffer_cache.h"
+#include "video_core/renderer_opengl/gl_query_cache.h"
 #include "video_core/renderer_opengl/gl_resource_manager.h"
 #include "video_core/renderer_opengl/gl_sampler_cache.h"
 #include "video_core/renderer_opengl/gl_shader_cache.h"
@@ -57,10 +58,11 @@ public:
                              ScreenInfo& info);
    ~RasterizerOpenGL() override;

-    bool DrawBatch(bool is_indexed) override;
-    bool DrawMultiBatch(bool is_indexed) override;
+    void Draw(bool is_indexed, bool is_instanced) override;
    void Clear() override;
    void DispatchCompute(GPUVAddr code_addr) override;
+    void ResetCounter(VideoCore::QueryType type) override;
+    void Query(GPUVAddr gpu_addr, VideoCore::QueryType type, std::optional<u64> timestamp) override;
    void FlushAll() override;
    void FlushRegion(CacheAddr addr, u64 size) override;
    void InvalidateRegion(CacheAddr addr, u64 size) override;
@@ -75,6 +77,11 @@ public:
    void LoadDiskResources(const std::atomic_bool& stop_loading,
                           const VideoCore::DiskResourceLoadCallback& callback) override;

+    /// Returns true when there are commands queued to the OpenGL server.
+    bool AnyCommandQueued() const {
+        return num_queued_commands > 0;
+    }
+
 private:
    /// Configures the color and depth framebuffer states.
    void ConfigureFramebuffers();
@@ -102,9 +109,6 @@ private:
    void SetupGlobalMemory(u32 binding, const GLShader::GlobalMemoryEntry& entry, GPUVAddr gpu_addr,
                           std::size_t size);

-    /// Syncs all the state, shaders, render targets and textures setting before a draw call.
-    void Draw(bool is_indexed, bool is_instanced);
-
    /// Configures the current textures to use for the draw command.
    void SetupDrawTextures(std::size_t stage_index, const Shader& shader);

@@ -180,10 +184,23 @@ private:
    /// Syncs the alpha test state to match the guest state
    void SyncAlphaTest();

-    /// Check for extension that are not strictly required
-    /// but are needed for correct emulation
+    /// Check for extensions that are not strictly required but are needed for correct emulation
    void CheckExtensions();

+    std::size_t CalculateVertexArraysSize() const;
+
+    std::size_t CalculateIndexBufferSize() const;
+
+    /// Updates and returns a vertex array object representing current vertex format
+    GLuint SetupVertexFormat();
+
+    void SetupVertexBuffer(GLuint vao);
+    void SetupVertexInstances(GLuint vao);
+
+    GLintptr SetupIndexBuffer();
+
+    void SetupShaders(GLenum primitive_mode);
+
    const Device device;
    OpenGLState state;

@@ -191,6 +208,7 @@ private:
    ShaderCacheOpenGL shader_cache;
    SamplerCacheOpenGL sampler_cache;
    FramebufferCacheOpenGL framebuffer_cache;
+    QueryCache query_cache;

    Core::System& system;
    ScreenInfo& screen_info;
@@ -208,19 +226,8 @@ private:
    BindBuffersRangePushBuffer bind_ubo_pushbuffer{GL_UNIFORM_BUFFER};
    BindBuffersRangePushBuffer bind_ssbo_pushbuffer{GL_SHADER_STORAGE_BUFFER};

-    std::size_t CalculateVertexArraysSize() const;
-
-    std::size_t CalculateIndexBufferSize() const;
-
-    /// Updates and returns a vertex array object representing current vertex format
-    GLuint SetupVertexFormat();
-
-    void SetupVertexBuffer(GLuint vao);
-    void SetupVertexInstances(GLuint vao);
-
-    GLintptr SetupIndexBuffer();
-
-    void SetupShaders(GLenum primitive_mode);
+    /// Number of commands queued to the OpenGL driver. Reset on flush.
+    std::size_t num_queued_commands = 0;
 };

 } // namespace OpenGL
@@ -15,6 +15,24 @@ MICROPROFILE_DEFINE(OpenGL_ResourceDeletion, "OpenGL", "Resource Deletion", MP_R

 namespace OpenGL {

+void OGLRenderbuffer::Create() {
+    if (handle != 0)
+        return;
+
+    MICROPROFILE_SCOPE(OpenGL_ResourceCreation);
+    glGenRenderbuffers(1, &handle);
+}
+
+void OGLRenderbuffer::Release() {
+    if (handle == 0)
+        return;
+
+    MICROPROFILE_SCOPE(OpenGL_ResourceDeletion);
+    glDeleteRenderbuffers(1, &handle);
+    OpenGLState::GetCurState().ResetRenderbuffer(handle).Apply();
+    handle = 0;
+}
+
 void OGLTexture::Create(GLenum target) {
    if (handle != 0)
        return;
@@ -207,4 +225,21 @@ void OGLFramebuffer::Release() {
    handle = 0;
 }

+void OGLQuery::Create(GLenum target) {
+    if (handle != 0)
+        return;
+
+    MICROPROFILE_SCOPE(OpenGL_ResourceCreation);
+    glCreateQueries(target, 1, &handle);
+}
+
+void OGLQuery::Release() {
+    if (handle == 0)
+        return;
+
+    MICROPROFILE_SCOPE(OpenGL_ResourceDeletion);
+    glDeleteQueries(1, &handle);
+    handle = 0;
+}
+
 } // namespace OpenGL
@@ -11,6 +11,31 @@

 namespace OpenGL {

+class OGLRenderbuffer : private NonCopyable {
+public:
+    OGLRenderbuffer() = default;
+
+    OGLRenderbuffer(OGLRenderbuffer&& o) noexcept : handle(std::exchange(o.handle, 0)) {}
+
+    ~OGLRenderbuffer() {
+        Release();
+    }
+
+    OGLRenderbuffer& operator=(OGLRenderbuffer&& o) noexcept {
+        Release();
+        handle = std::exchange(o.handle, 0);
+        return *this;
+    }
+
+    /// Creates a new internal OpenGL resource and stores the handle
+    void Create();
+
+    /// Deletes the internal OpenGL resource
+    void Release();
+
+    GLuint handle = 0;
+};
+
 class OGLTexture : private NonCopyable {
 public:
    OGLTexture() = default;
@@ -266,4 +291,29 @@ public:
    GLuint handle = 0;
 };

+class OGLQuery : private NonCopyable {
+public:
+    OGLQuery() = default;
+
+    OGLQuery(OGLQuery&& o) noexcept : handle(std::exchange(o.handle, 0)) {}
+
+    ~OGLQuery() {
+        Release();
+    }
+
+    OGLQuery& operator=(OGLQuery&& o) noexcept {
+        Release();
+        handle = std::exchange(o.handle, 0);
+        return *this;
+    }
+
+    /// Creates a new internal OpenGL resource and stores the handle
+    void Create(GLenum target);
+
+    /// Deletes the internal OpenGL resource
+    void Release();
+
+    GLuint handle = 0;
+};
+
 } // namespace OpenGL
@@ -423,6 +423,13 @@ void OpenGLState::ApplyClipControl() {
    }
 }

+void OpenGLState::ApplyRenderBuffer() {
+    if (cur_state.renderbuffer != renderbuffer) {
+        cur_state.renderbuffer = renderbuffer;
+        glBindRenderbuffer(GL_RENDERBUFFER, renderbuffer);
+    }
+}
+
 void OpenGLState::ApplyTextures() {
    const std::size_t size = std::size(textures);
    for (std::size_t i = 0; i < size; ++i) {
@@ -478,6 +485,7 @@ void OpenGLState::Apply() {
    ApplyPolygonOffset();
    ApplyAlphaTest();
    ApplyClipControl();
+    ApplyRenderBuffer();
 }

 void OpenGLState::EmulateViewportWithScissor() {
@@ -551,4 +559,11 @@ OpenGLState& OpenGLState::ResetFramebuffer(GLuint handle) {
    return *this;
 }

+OpenGLState& OpenGLState::ResetRenderbuffer(GLuint handle) {
+    if (renderbuffer == handle) {
+        renderbuffer = 0;
+    }
+    return *this;
+}
+
 } // namespace OpenGL
@@ -158,6 +158,8 @@ public:
        GLenum depth_mode = GL_NEGATIVE_ONE_TO_ONE;
    } clip_control;

+    GLuint renderbuffer{}; // GL_RENDERBUFFER_BINDING
+
    OpenGLState();

    /// Get the currently active OpenGL state
@@ -196,6 +198,7 @@ public:
    void ApplyPolygonOffset();
    void ApplyAlphaTest();
    void ApplyClipControl();
+    void ApplyRenderBuffer();

    /// Resets any references to the given resource
    OpenGLState& UnbindTexture(GLuint handle);
@@ -204,6 +207,7 @@ public:
    OpenGLState& ResetPipeline(GLuint handle);
    OpenGLState& ResetVertexArray(GLuint handle);
    OpenGLState& ResetFramebuffer(GLuint handle);
+    OpenGLState& ResetRenderbuffer(GLuint handle);

    /// Viewport does not affects glClearBuffer so emulate viewport using scissor test
    void EmulateViewportWithScissor();
@@ -87,6 +87,7 @@ constexpr std::array<FormatTuple, VideoCore::Surface::MaxPixelFormat> tex_format
    {GL_RG32UI, GL_RG_INTEGER, GL_UNSIGNED_INT, false},                             // RG32UI
    {GL_RGB16F, GL_RGBA, GL_HALF_FLOAT, false},                                     // RGBX16F
    {GL_R32UI, GL_RED_INTEGER, GL_UNSIGNED_INT, false},                             // R32UI
+    {GL_R32I, GL_RED_INTEGER, GL_INT, false},                                       // R32I
    {GL_RGBA8, GL_RGBA, GL_UNSIGNED_BYTE, false},                                   // ASTC_2D_8X8
    {GL_RGBA8, GL_RGBA, GL_UNSIGNED_BYTE, false},                                   // ASTC_2D_8X5
    {GL_RGBA8, GL_RGBA, GL_UNSIGNED_BYTE, false},                                   // ASTC_2D_5X4
@@ -260,6 +261,13 @@ CachedSurface::~CachedSurface() = default;
 void CachedSurface::DownloadTexture(std::vector<u8>& staging_buffer) {
    MICROPROFILE_SCOPE(OpenGL_Texture_Download);

+    if (params.IsBuffer()) {
+        glGetNamedBufferSubData(texture_buffer.handle, 0,
+                                static_cast<GLsizeiptr>(params.GetHostSizeInBytes()),
+                                staging_buffer.data());
+        return;
+    }
+
    SCOPE_EXIT({ glPixelStorei(GL_PACK_ROW_LENGTH, 0); });

    for (u32 level = 0; level < params.emulated_levels; ++level) {
@@ -398,24 +406,36 @@ CachedSurfaceView::CachedSurfaceView(CachedSurface& surface, const ViewParams& p
 CachedSurfaceView::~CachedSurfaceView() = default;

 void CachedSurfaceView::Attach(GLenum attachment, GLenum target) const {
-    ASSERT(params.num_layers == 1 && params.num_levels == 1);
+    ASSERT(params.num_levels == 1);

-    const auto& owner_params = surface.GetSurfaceParams();
+    const GLuint texture = surface.GetTexture();
+    if (params.num_layers > 1) {
+        // Layered framebuffer attachments
+        UNIMPLEMENTED_IF(params.base_layer != 0);

-    switch (owner_params.target) {
+        switch (params.target) {
+        case SurfaceTarget::Texture2DArray:
+            glFramebufferTexture(target, attachment, texture, params.base_level);
+            break;
+        default:
+            UNIMPLEMENTED();
+        }
+        return;
+    }
+
+    const GLenum view_target = surface.GetTarget();
+    switch (surface.GetSurfaceParams().target) {
    case SurfaceTarget::Texture1D:
-        glFramebufferTexture1D(target, attachment, surface.GetTarget(), surface.GetTexture(),
-                               params.base_level);
+        glFramebufferTexture1D(target, attachment, view_target, texture, params.base_level);
        break;
    case SurfaceTarget::Texture2D:
-        glFramebufferTexture2D(target, attachment, surface.GetTarget(), surface.GetTexture(),
-                               params.base_level);
+        glFramebufferTexture2D(target, attachment, view_target, texture, params.base_level);
        break;
    case SurfaceTarget::Texture1DArray:
    case SurfaceTarget::Texture2DArray:
    case SurfaceTarget::TextureCubemap:
    case SurfaceTarget::TextureCubeArray:
-        glFramebufferTextureLayer(target, attachment, surface.GetTexture(), params.base_level,
+        glFramebufferTextureLayer(target, attachment, texture, params.base_level,
                                  params.base_layer);
        break;
    default:
@@ -92,8 +92,32 @@ inline GLenum VertexType(Maxwell::VertexAttribute attrib) {
        }
    case Maxwell::VertexAttribute::Type::UnsignedScaled:
        switch (attrib.size) {
+        case Maxwell::VertexAttribute::Size::Size_8:
        case Maxwell::VertexAttribute::Size::Size_8_8:
+        case Maxwell::VertexAttribute::Size::Size_8_8_8:
+        case Maxwell::VertexAttribute::Size::Size_8_8_8_8:
            return GL_UNSIGNED_BYTE;
+        case Maxwell::VertexAttribute::Size::Size_16:
+        case Maxwell::VertexAttribute::Size::Size_16_16:
+        case Maxwell::VertexAttribute::Size::Size_16_16_16:
+        case Maxwell::VertexAttribute::Size::Size_16_16_16_16:
+            return GL_UNSIGNED_SHORT;
+        default:
+            LOG_ERROR(Render_OpenGL, "Unimplemented vertex size={}", attrib.SizeString());
+            return {};
+        }
+    case Maxwell::VertexAttribute::Type::SignedScaled:
+        switch (attrib.size) {
+        case Maxwell::VertexAttribute::Size::Size_8:
+        case Maxwell::VertexAttribute::Size::Size_8_8:
+        case Maxwell::VertexAttribute::Size::Size_8_8_8:
+        case Maxwell::VertexAttribute::Size::Size_8_8_8_8:
+            return GL_BYTE;
+        case Maxwell::VertexAttribute::Size::Size_16:
+        case Maxwell::VertexAttribute::Size::Size_16_16:
+        case Maxwell::VertexAttribute::Size::Size_16_16_16:
+        case Maxwell::VertexAttribute::Size::Size_16_16_16_16:
+            return GL_SHORT;
        default:
            LOG_ERROR(Render_OpenGL, "Unimplemented vertex size={}", attrib.SizeString());
            return {};
@@ -9,11 +9,11 @@
 #include <glad/glad.h>
 #include "common/assert.h"
 #include "common/logging/log.h"
+#include "common/microprofile.h"
 #include "common/telemetry.h"
 #include "core/core.h"
 #include "core/core_timing.h"
 #include "core/frontend/emu_window.h"
-#include "core/frontend/scope_acquire_window_context.h"
 #include "core/memory.h"
 #include "core/perf_stats.h"
 #include "core/settings.h"
@@ -24,6 +24,144 @@

 namespace OpenGL {

+// If the size of this is too small, it ends up creating a soft cap on FPS as the renderer will have
+// to wait on available presentation frames.
+constexpr std::size_t SWAP_CHAIN_SIZE = 3;
+
+struct Frame {
+    u32 width{};                      /// Width of the frame (to detect resize)
+    u32 height{};                     /// Height of the frame
+    bool color_reloaded{};            /// Texture attachment was recreated (ie: resized)
+    OpenGL::OGLRenderbuffer color{};  /// Buffer shared between the render/present FBO
+    OpenGL::OGLFramebuffer render{};  /// FBO created on the render thread
+    OpenGL::OGLFramebuffer present{}; /// FBO created on the present thread
+    GLsync render_fence{};            /// Fence created on the render thread
+    GLsync present_fence{};           /// Fence created on the presentation thread
+    bool is_srgb{};                   /// Framebuffer is sRGB or RGB
+};
+
+/**
+ * For smooth Vsync rendering, we want to always present the latest frame that the core generates,
+ * but also make sure that rendering happens at the pace that the frontend dictates. This is a
+ * helper class that the renderer uses to sync frames between the render thread and the presentation
+ * thread
+ */
+class FrameMailbox {
+public:
+    std::mutex swap_chain_lock;
+    std::condition_variable present_cv;
+    std::array<Frame, SWAP_CHAIN_SIZE> swap_chain{};
+    std::queue<Frame*> free_queue;
+    std::deque<Frame*> present_queue;
+    Frame* previous_frame{};
+
+    FrameMailbox() {
+        for (auto& frame : swap_chain) {
+            free_queue.push(&frame);
+        }
+    }
+
+    ~FrameMailbox() {
+        // Lock the mutex, clear out the present and free queues, and notify any threads that are
+        // blocked to prevent deadlock on shutdown
+        std::scoped_lock lock{swap_chain_lock};
+        std::queue<Frame*>().swap(free_queue);
+        present_queue.clear();
+        present_cv.notify_all();
+    }
+
+    void ReloadPresentFrame(Frame* frame, u32 height, u32 width) {
+        frame->present.Release();
+        frame->present.Create();
+        GLint previous_draw_fbo{};
+        glGetIntegerv(GL_DRAW_FRAMEBUFFER_BINDING, &previous_draw_fbo);
+        glBindFramebuffer(GL_FRAMEBUFFER, frame->present.handle);
+        glFramebufferRenderbuffer(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_RENDERBUFFER,
+                                  frame->color.handle);
+        if (glCheckFramebufferStatus(GL_FRAMEBUFFER) != GL_FRAMEBUFFER_COMPLETE) {
+            LOG_CRITICAL(Render_OpenGL, "Failed to recreate present FBO!");
+        }
+        glBindFramebuffer(GL_DRAW_FRAMEBUFFER, previous_draw_fbo);
+        frame->color_reloaded = false;
+    }
+
+    void ReloadRenderFrame(Frame* frame, u32 width, u32 height) {
+        OpenGLState prev_state = OpenGLState::GetCurState();
+        OpenGLState state = OpenGLState::GetCurState();
+
+        // Recreate the color texture attachment
+        frame->color.Release();
+        frame->color.Create();
+        state.renderbuffer = frame->color.handle;
+        state.Apply();
+        glRenderbufferStorage(GL_RENDERBUFFER, frame->is_srgb ? GL_SRGB8 : GL_RGB8, width, height);
+
+        // Recreate the FBO for the render target
+        frame->render.Release();
+        frame->render.Create();
+        state.draw.read_framebuffer = frame->render.handle;
+        state.draw.draw_framebuffer = frame->render.handle;
+        state.Apply();
+        glFramebufferRenderbuffer(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_RENDERBUFFER,
+                                  frame->color.handle);
+        if (glCheckFramebufferStatus(GL_FRAMEBUFFER) != GL_FRAMEBUFFER_COMPLETE) {
+            LOG_CRITICAL(Render_OpenGL, "Failed to recreate render FBO!");
+        }
+        prev_state.Apply();
+        frame->width = width;
+        frame->height = height;
+        frame->color_reloaded = true;
+    }
+
+    Frame* GetRenderFrame() {
+        std::unique_lock lock{swap_chain_lock};
+
+        // If there are no free frames, we will reuse the oldest render frame
+        if (free_queue.empty()) {
+            auto frame = present_queue.back();
+            present_queue.pop_back();
+            return frame;
+        }
+
+        Frame* frame = free_queue.front();
+        free_queue.pop();
+        return frame;
+    }
+
+    void ReleaseRenderFrame(Frame* frame) {
+        std::unique_lock lock{swap_chain_lock};
+        present_queue.push_front(frame);
+        present_cv.notify_one();
+    }
+
+    Frame* TryGetPresentFrame(int timeout_ms) {
+        std::unique_lock lock{swap_chain_lock};
+        // wait for new entries in the present_queue
+        present_cv.wait_for(lock, std::chrono::milliseconds(timeout_ms),
+                            [&] { return !present_queue.empty(); });
+        if (present_queue.empty()) {
+            // timed out waiting for a frame to draw so return the previous frame
+            return previous_frame;
+        }
+
+        // free the previous frame and add it back to the free queue
+        if (previous_frame) {
+            free_queue.push(previous_frame);
+        }
+
+        // the newest entries are pushed to the front of the queue
+        Frame* frame = present_queue.front();
+        present_queue.pop_front();
+        // remove all old entries from the present queue and move them back to the free_queue
+        for (auto f : present_queue) {
+            free_queue.push(f);
+        }
+        present_queue.clear();
+        previous_frame = frame;
+        return frame;
+    }
+};
+
 namespace {

 constexpr char vertex_shader[] = R"(
@@ -158,21 +296,91 @@ void APIENTRY DebugHandler(GLenum source, GLenum type, GLuint id, GLenum severit
 } // Anonymous namespace

 RendererOpenGL::RendererOpenGL(Core::Frontend::EmuWindow& emu_window, Core::System& system)
-    : VideoCore::RendererBase{emu_window}, emu_window{emu_window}, system{system} {}
+    : VideoCore::RendererBase{emu_window}, emu_window{emu_window}, system{system},
+      frame_mailbox{std::make_unique<FrameMailbox>()} {}

 RendererOpenGL::~RendererOpenGL() = default;

+MICROPROFILE_DEFINE(OpenGL_RenderFrame, "OpenGL", "Render Frame", MP_RGB(128, 128, 64));
+MICROPROFILE_DEFINE(OpenGL_WaitPresent, "OpenGL", "Wait For Present", MP_RGB(128, 128, 128));
+
 void RendererOpenGL::SwapBuffers(const Tegra::FramebufferConfig* framebuffer) {
+    render_window.PollEvents();
+
+    if (!framebuffer) {
+        return;
+    }
+
    // Maintain the rasterizer's state as a priority
    OpenGLState prev_state = OpenGLState::GetCurState();
    state.AllDirty();
    state.Apply();

+    PrepareRendertarget(framebuffer);
+    RenderScreenshot();
+
+    Frame* frame;
+    {
+        MICROPROFILE_SCOPE(OpenGL_WaitPresent);
+
+        frame = frame_mailbox->GetRenderFrame();
+
+        // Clean up sync objects before drawing
+
+        // INTEL driver workaround. We can't delete the previous render sync object until we are
+        // sure that the presentation is done
+        if (frame->present_fence) {
+            glClientWaitSync(frame->present_fence, 0, GL_TIMEOUT_IGNORED);
+        }
+
+        // delete the draw fence if the frame wasn't presented
+        if (frame->render_fence) {
+            glDeleteSync(frame->render_fence);
+            frame->render_fence = 0;
+        }
+
+        // wait for the presentation to be done
+        if (frame->present_fence) {
+            glWaitSync(frame->present_fence, 0, GL_TIMEOUT_IGNORED);
+            glDeleteSync(frame->present_fence);
+            frame->present_fence = 0;
+        }
+    }
+
+    {
+        MICROPROFILE_SCOPE(OpenGL_RenderFrame);
+        const auto& layout = render_window.GetFramebufferLayout();
+
+        // Recreate the frame if the size of the window has changed
+        if (layout.width != frame->width || layout.height != frame->height ||
+            screen_info.display_srgb != frame->is_srgb) {
+            LOG_DEBUG(Render_OpenGL, "Reloading render frame");
+            frame->is_srgb = screen_info.display_srgb;
+            frame_mailbox->ReloadRenderFrame(frame, layout.width, layout.height);
+        }
+        state.draw.draw_framebuffer = frame->render.handle;
+        state.Apply();
+        DrawScreen(layout);
+        // Create a fence for the frontend to wait on and queue this frame for presentation
+        frame->render_fence = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0);
+        glFlush();
+        frame_mailbox->ReleaseRenderFrame(frame);
+        m_current_frame++;
+        rasterizer->TickFrame();
+    }
+
+    // Restore the rasterizer state
+    prev_state.AllDirty();
+    prev_state.Apply();
+}
+
+void RendererOpenGL::PrepareRendertarget(const Tegra::FramebufferConfig* framebuffer) {
    if (framebuffer) {
        // If framebuffer is provided, reload it from memory to a texture
        if (screen_info.texture.width != static_cast<GLsizei>(framebuffer->width) ||
            screen_info.texture.height != static_cast<GLsizei>(framebuffer->height) ||
-            screen_info.texture.pixel_format != framebuffer->pixel_format) {
+            screen_info.texture.pixel_format != framebuffer->pixel_format ||
+            gl_framebuffer_data.empty()) {
            // Reallocate texture if the framebuffer size has changed.
            // This is expected to not happen very often and hence should not be a
            // performance problem.
@@ -181,22 +389,7 @@ void RendererOpenGL::SwapBuffers(const Tegra::FramebufferConfig* framebuffer) {

        // Load the framebuffer from memory, draw it to the screen, and swap buffers
        LoadFBToScreenInfo(*framebuffer);
-
-        if (renderer_settings.screenshot_requested)
-            CaptureScreenshot();
-
-        DrawScreen(render_window.GetFramebufferLayout());
-
-        rasterizer->TickFrame();
-
-        render_window.SwapBuffers();
    }
-
-    render_window.PollEvents();
-
-    // Restore the rasterizer state
-    prev_state.AllDirty();
-    prev_state.Apply();
 }

 void RendererOpenGL::LoadFBToScreenInfo(const Tegra::FramebufferConfig& framebuffer) {
@@ -418,13 +611,48 @@ void RendererOpenGL::DrawScreen(const Layout::FramebufferLayout& layout) {
    DrawScreenTriangles(screen_info, static_cast<float>(screen.left),
                        static_cast<float>(screen.top), static_cast<float>(screen.GetWidth()),
                        static_cast<float>(screen.GetHeight()));
-
-    m_current_frame++;
 }

-void RendererOpenGL::UpdateFramerate() {}
+void RendererOpenGL::TryPresent(int timeout_ms) {
+    const auto& layout = render_window.GetFramebufferLayout();
+    auto frame = frame_mailbox->TryGetPresentFrame(timeout_ms);
+    if (!frame) {
+        LOG_DEBUG(Render_OpenGL, "TryGetPresentFrame returned no frame to present");
+        return;
+    }
+
+    // Clearing before a full overwrite of a fbo can signal to drivers that they can avoid a
+    // readback since we won't be doing any blending
+    glClear(GL_COLOR_BUFFER_BIT);
+
+    // Recreate the presentation FBO if the color attachment was changed
+    if (frame->color_reloaded) {
+        LOG_DEBUG(Render_OpenGL, "Reloading present frame");
+        frame_mailbox->ReloadPresentFrame(frame, layout.width, layout.height);
+    }
+    glWaitSync(frame->render_fence, 0, GL_TIMEOUT_IGNORED);
+    // INTEL workaround.
+    // Normally we could just delete the draw fence here, but due to driver bugs, we instead delete
+    // it on the emulation thread (see SwapBuffers) without too much penalty
+    // glDeleteSync(frame->render_fence);
+    // frame->render_fence = 0;
+
+    glBindFramebuffer(GL_READ_FRAMEBUFFER, frame->present.handle);
+    glBlitFramebuffer(0, 0, frame->width, frame->height, 0, 0, layout.width, layout.height,
+                      GL_COLOR_BUFFER_BIT, GL_LINEAR);
+
+    // Insert fence for the main thread to block on
+    frame->present_fence = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0);
+    glFlush();
+
+    glBindFramebuffer(GL_READ_FRAMEBUFFER, 0);
+}
+
+void RendererOpenGL::RenderScreenshot() {
+    if (!renderer_settings.screenshot_requested) {
+        return;
+    }

-void RendererOpenGL::CaptureScreenshot() {
    // Draw the current frame to the screenshot framebuffer
    screenshot_framebuffer.Create();
    GLuint old_read_fb = state.draw.read_framebuffer;
@@ -459,8 +687,6 @@ void RendererOpenGL::CaptureScreenshot() {
 }

 bool RendererOpenGL::Init() {
-    Core::Frontend::ScopeAcquireWindowContext acquire_context{render_window};
-
    if (GLAD_GL_KHR_debug) {
        glEnable(GL_DEBUG_OUTPUT);
        glDebugMessageCallback(DebugHandler, nullptr);
@@ -44,19 +44,23 @@ struct ScreenInfo {
    TextureInfo texture;
 };

+/// An OpenGL texture bundled with a width and a height, both of which start at zero.
+struct PresentationTexture {
+    u32 width{};
+    u32 height{};
+    OGLTexture texture;
+};
+
+class FrameMailbox;
+
 class RendererOpenGL final : public VideoCore::RendererBase {
 public:
    explicit RendererOpenGL(Core::Frontend::EmuWindow& emu_window, Core::System& system);
    ~RendererOpenGL() override;

-    /// Swap buffers (render frame)
-    void SwapBuffers(const Tegra::FramebufferConfig* framebuffer) override;
-
-    /// Initialize the renderer
    bool Init() override;
-
-    /// Shutdown the renderer
    void ShutDown() override;
+    void SwapBuffers(const Tegra::FramebufferConfig* framebuffer) override;
+    void TryPresent(int timeout_ms) override;

 private:
    /// Initializes the OpenGL state and creates persistent objects.
@@ -74,10 +78,7 @@ private:

    void DrawScreenTriangles(const ScreenInfo& screen_info, float x, float y, float w, float h);

-    /// Updates the framerate.
-    void UpdateFramerate();
-
-    void CaptureScreenshot();
+    void RenderScreenshot();

    /// Loads framebuffer from emulated memory into the active OpenGL texture.
    void LoadFBToScreenInfo(const Tegra::FramebufferConfig& framebuffer);
@@ -87,6 +88,8 @@ private:
    void LoadColorToActiveGLTexture(u8 color_r, u8 color_g, u8 color_b, u8 color_a,
                                    const TextureInfo& texture);

+    void PrepareRendertarget(const Tegra::FramebufferConfig* framebuffer);
+
    Core::Frontend::EmuWindow& emu_window;
    Core::System& system;

@@ -107,6 +110,9 @@ private:
    /// Used for transforming the framebuffer orientation
    Tegra::FramebufferConfig::TransformFlags framebuffer_transform_flags;
    Common::Rectangle<int> framebuffer_crop_rect;
+
+    /// Frame presentation mailbox
+    std::unique_ptr<FrameMailbox> frame_mailbox;
 };

 } // namespace OpenGL
@@ -120,7 +120,7 @@ struct FormatTuple {
    {vk::Format::eA8B8G8R8UintPack32, Attachable | Storage},     // ABGR8UI
    {vk::Format::eB5G6R5UnormPack16, {}},                        // B5G6R5U
    {vk::Format::eA2B10G10R10UnormPack32, Attachable | Storage}, // A2B10G10R10U
-    {vk::Format::eA1R5G5B5UnormPack16, Attachable | Storage},    // A1B5G5R5U (flipped with swizzle)
+    {vk::Format::eA1R5G5B5UnormPack16, Attachable},              // A1B5G5R5U (flipped with swizzle)
    {vk::Format::eR8Unorm, Attachable | Storage},                // R8U
    {vk::Format::eR8Uint, Attachable | Storage},                 // R8UI
    {vk::Format::eR16G16B16A16Sfloat, Attachable | Storage},     // RGBA16F
@@ -159,12 +159,13 @@ struct FormatTuple {
    {vk::Format::eR32G32Uint, Attachable | Storage},             // RG32UI
    {vk::Format::eUndefined, {}},                                // RGBX16F
    {vk::Format::eR32Uint, Attachable | Storage},                // R32UI
+    {vk::Format::eR32Sint, Attachable | Storage},                // R32I
    {vk::Format::eAstc8x8UnormBlock, {}},                        // ASTC_2D_8X8
    {vk::Format::eUndefined, {}},                                // ASTC_2D_8X5
    {vk::Format::eUndefined, {}},                                // ASTC_2D_5X4
    {vk::Format::eUndefined, {}},                                // BGRA8_SRGB
    {vk::Format::eBc1RgbaSrgbBlock, {}},                         // DXT1_SRGB
-    {vk::Format::eUndefined, {}},                                // DXT23_SRGB
+    {vk::Format::eBc2SrgbBlock, {}},                             // DXT23_SRGB
    {vk::Format::eBc3SrgbBlock, {}},                             // DXT45_SRGB
    {vk::Format::eBc7SrgbBlock, {}},                             // BC7U_SRGB
    {vk::Format::eR4G4B4A4UnormPack16, Attachable},              // R4G4B4A4U
@@ -363,13 +364,29 @@ vk::Format VertexFormat(Maxwell::VertexAttribute::Type type, Maxwell::VertexAttr
            return vk::Format::eR8G8B8A8Uint;
        case Maxwell::VertexAttribute::Size::Size_32:
            return vk::Format::eR32Uint;
+        case Maxwell::VertexAttribute::Size::Size_32_32_32_32:
+            return vk::Format::eR32G32B32A32Uint;
        default:
            break;
        }
    case Maxwell::VertexAttribute::Type::UnsignedScaled:
        switch (size) {
+        case Maxwell::VertexAttribute::Size::Size_8:
+            return vk::Format::eR8Uscaled;
        case Maxwell::VertexAttribute::Size::Size_8_8:
            return vk::Format::eR8G8Uscaled;
+        case Maxwell::VertexAttribute::Size::Size_8_8_8:
+            return vk::Format::eR8G8B8Uscaled;
+        case Maxwell::VertexAttribute::Size::Size_8_8_8_8:
+            return vk::Format::eR8G8B8A8Uscaled;
+        case Maxwell::VertexAttribute::Size::Size_16:
+            return vk::Format::eR16Uscaled;
+        case Maxwell::VertexAttribute::Size::Size_16_16:
+            return vk::Format::eR16G16Uscaled;
+        case Maxwell::VertexAttribute::Size::Size_16_16_16:
+            return vk::Format::eR16G16B16Uscaled;
+        case Maxwell::VertexAttribute::Size::Size_16_16_16_16:
+            return vk::Format::eR16G16B16A16Uscaled;
        default:
            break;
        }
@@ -106,8 +106,14 @@ RendererVulkan::~RendererVulkan() {
 }

 void RendererVulkan::SwapBuffers(const Tegra::FramebufferConfig* framebuffer) {
+    render_window.PollEvents();
+
+    if (!framebuffer) {
+        return;
+    }
+
    const auto& layout = render_window.GetFramebufferLayout();
-    if (framebuffer && layout.width > 0 && layout.height > 0 && render_window.IsShown()) {
+    if (layout.width > 0 && layout.height > 0 && render_window.IsShown()) {
        const VAddr framebuffer_addr = framebuffer->address + framebuffer->offset;
        const bool use_accelerated =
            rasterizer->AccelerateDisplay(*framebuffer, framebuffer_addr, framebuffer->stride);
@@ -128,13 +134,16 @@ void RendererVulkan::SwapBuffers(const Tegra::FramebufferConfig* framebuffer) {
            blit_screen->Recreate();
        }

-        render_window.SwapBuffers();
        rasterizer->TickFrame();
    }

    render_window.PollEvents();
 }

+/// Presentation entry point overridden from the renderer base class.
+/// Not implemented for Vulkan yet: the call does nothing and the timeout is ignored.
+void RendererVulkan::TryPresent(int /*timeout_ms*/) {
+    // TODO (bunnei): ImplementMe
+}
+
 bool RendererVulkan::Init() {
    PFN_vkGetInstanceProcAddr vkGetInstanceProcAddr{};
    render_window.RetrieveVulkanHandlers(&vkGetInstanceProcAddr, &instance, &surface);
@@ -262,4 +271,4 @@ void RendererVulkan::Report() const {
    telemetry_session.AddField(field, "GPU_Vulkan_Extensions", extensions);
 }

-} // namespace Vulkan
+} // namespace Vulkan
@@ -36,14 +36,10 @@ public:
    explicit RendererVulkan(Core::Frontend::EmuWindow& window, Core::System& system);
    ~RendererVulkan() override;

-    /// Swap buffers (render frame)
-    void SwapBuffers(const Tegra::FramebufferConfig* framebuffer) override;
-
-    /// Initialize the renderer
    bool Init() override;
-
-    /// Shutdown the renderer
    void ShutDown() override;
+    void SwapBuffers(const Tegra::FramebufferConfig* framebuffer) override;
+    void TryPresent(int timeout_ms) override;

 private:
    std::optional<vk::DebugUtilsMessengerEXT> CreateDebugCallback(
@@ -73,7 +73,7 @@ UniqueDescriptorUpdateTemplate VKComputePipeline::CreateDescriptorUpdateTemplate
    std::vector<vk::DescriptorUpdateTemplateEntry> template_entries;
    u32 binding = 0;
    u32 offset = 0;
-    FillDescriptorUpdateTemplateEntries(device, entries, binding, offset, template_entries);
+    FillDescriptorUpdateTemplateEntries(entries, binding, offset, template_entries);
    if (template_entries.empty()) {
        // If the shader doesn't use descriptor sets, skip template creation.
        return UniqueDescriptorUpdateTemplate{};
@@ -104,8 +104,11 @@ bool VKDevice::Create(const vk::DispatchLoaderDynamic& dldi, vk::Instance instan
    features.depthBiasClamp = true;
    features.geometryShader = true;
    features.tessellationShader = true;
+    features.occlusionQueryPrecise = true;
    features.fragmentStoresAndAtomics = true;
    features.shaderImageGatherExtended = true;
+    features.shaderStorageImageReadWithoutFormat =
+        is_shader_storage_img_read_without_format_supported;
    features.shaderStorageImageWriteWithoutFormat = true;
    features.textureCompressionASTC_LDR = is_optimal_astc_supported;

@@ -117,6 +120,10 @@ bool VKDevice::Create(const vk::DispatchLoaderDynamic& dldi, vk::Instance instan
    bit8_storage.uniformAndStorageBuffer8BitAccess = true;
    SetNext(next, bit8_storage);

+    vk::PhysicalDeviceHostQueryResetFeaturesEXT host_query_reset;
+    host_query_reset.hostQueryReset = true;
+    SetNext(next, host_query_reset);
+
    vk::PhysicalDeviceFloat16Int8FeaturesKHR float16_int8;
    if (is_float16_supported) {
        float16_int8.shaderFloat16 = true;
@@ -273,6 +280,7 @@ bool VKDevice::IsSuitable(const vk::DispatchLoaderDynamic& dldi, vk::PhysicalDev
        VK_EXT_VERTEX_ATTRIBUTE_DIVISOR_EXTENSION_NAME,
        VK_EXT_SHADER_SUBGROUP_BALLOT_EXTENSION_NAME,
        VK_EXT_SHADER_SUBGROUP_VOTE_EXTENSION_NAME,
+        VK_EXT_HOST_QUERY_RESET_EXTENSION_NAME,
    };
    std::bitset<required_extensions.size()> available_extensions{};

@@ -340,6 +348,7 @@ bool VKDevice::IsSuitable(const vk::DispatchLoaderDynamic& dldi, vk::PhysicalDev
        std::make_pair(features.depthBiasClamp, "depthBiasClamp"),
        std::make_pair(features.geometryShader, "geometryShader"),
        std::make_pair(features.tessellationShader, "tessellationShader"),
+        std::make_pair(features.occlusionQueryPrecise, "occlusionQueryPrecise"),
        std::make_pair(features.fragmentStoresAndAtomics, "fragmentStoresAndAtomics"),
        std::make_pair(features.shaderImageGatherExtended, "shaderImageGatherExtended"),
        std::make_pair(features.shaderStorageImageWriteWithoutFormat,
@@ -376,7 +385,7 @@ std::vector<const char*> VKDevice::LoadExtensions(const vk::DispatchLoaderDynami
        }
    };

-    extensions.reserve(13);
+    extensions.reserve(14);
    extensions.push_back(VK_KHR_SWAPCHAIN_EXTENSION_NAME);
    extensions.push_back(VK_KHR_16BIT_STORAGE_EXTENSION_NAME);
    extensions.push_back(VK_KHR_8BIT_STORAGE_EXTENSION_NAME);
@@ -384,6 +393,7 @@ std::vector<const char*> VKDevice::LoadExtensions(const vk::DispatchLoaderDynami
    extensions.push_back(VK_EXT_VERTEX_ATTRIBUTE_DIVISOR_EXTENSION_NAME);
    extensions.push_back(VK_EXT_SHADER_SUBGROUP_BALLOT_EXTENSION_NAME);
    extensions.push_back(VK_EXT_SHADER_SUBGROUP_VOTE_EXTENSION_NAME);
+    extensions.push_back(VK_EXT_HOST_QUERY_RESET_EXTENSION_NAME);

    [[maybe_unused]] const bool nsight =
        std::getenv("NVTX_INJECTION64_PATH") || std::getenv("NSIGHT_LAUNCHED");
@@ -457,6 +467,8 @@ void VKDevice::SetupFamilies(const vk::DispatchLoaderDynamic& dldi, vk::SurfaceK

 void VKDevice::SetupFeatures(const vk::DispatchLoaderDynamic& dldi) {
+    // Query the physical device's feature set once and cache the optional capabilities.
    const auto supported_features{physical.getFeatures(dldi)};
+    // Cached so the feature is only requested at device creation when the hardware offers it.
+    is_shader_storage_img_read_without_format_supported =
+        supported_features.shaderStorageImageReadWithoutFormat;
    is_optimal_astc_supported = IsOptimalAstcSupported(supported_features, dldi);
 }

@@ -511,6 +523,7 @@ std::unordered_map<vk::Format, vk::FormatProperties> VKDevice::GetFormatProperti
                                        vk::Format::eB10G11R11UfloatPack32,
                                        vk::Format::eR32Sfloat,
                                        vk::Format::eR32Uint,
+                                        vk::Format::eR32Sint,
                                        vk::Format::eR16Sfloat,
                                        vk::Format::eR16G16B16A16Sfloat,
                                        vk::Format::eB8G8R8A8Unorm,
@@ -530,6 +543,7 @@ std::unordered_map<vk::Format, vk::FormatProperties> VKDevice::GetFormatProperti
                                        vk::Format::eBc6HUfloatBlock,
                                        vk::Format::eBc6HSfloatBlock,
                                        vk::Format::eBc1RgbaSrgbBlock,
+                                        vk::Format::eBc2SrgbBlock,
                                        vk::Format::eBc3SrgbBlock,
                                        vk::Format::eBc7SrgbBlock,
                                        vk::Format::eAstc4x4SrgbBlock,
@@ -122,6 +122,11 @@ public:
        return properties.limits.maxPushConstantsSize;
    }

+    /// Returns true if the device supports reading storage images without a format
+    /// (the shaderStorageImageReadWithoutFormat feature).
+    bool IsShaderStorageImageReadWithoutFormatSupported() const {
+        return is_shader_storage_img_read_without_format_supported;
+    }
+
    /// Returns true if ASTC is natively supported.
    bool IsOptimalAstcSupported() const {
        return is_optimal_astc_supported;
@@ -227,6 +232,8 @@ private:
    bool ext_depth_range_unrestricted{};       ///< Support for VK_EXT_depth_range_unrestricted.
    bool ext_shader_viewport_index_layer{};    ///< Support for VK_EXT_shader_viewport_index_layer.
    bool nv_device_diagnostic_checkpoints{};   ///< Support for VK_NV_device_diagnostic_checkpoints.
+    bool is_shader_storage_img_read_without_format_supported{}; ///< Support for shader storage
+                                                                ///< image read without format

    // Telemetry parameters
    std::string vendor_name;                      ///< Device's driver name.
@@ -97,8 +97,7 @@ UniqueDescriptorUpdateTemplate VKGraphicsPipeline::CreateDescriptorUpdateTemplat
    u32 offset = 0;
    for (const auto& stage : program) {
        if (stage) {
-            FillDescriptorUpdateTemplateEntries(device, stage->entries, binding, offset,
-                                                template_entries);
+            FillDescriptorUpdateTemplateEntries(stage->entries, binding, offset, template_entries);
        }
    }
    if (template_entries.empty()) {
@@ -36,6 +36,13 @@ using Tegra::Engines::ShaderType;

 namespace {

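+// Stand-in for C++20's "using enum vk::DescriptorType": unqualified aliases for the descriptor
+// types used in this file.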
+constexpr auto eUniformBuffer = vk::DescriptorType::eUniformBuffer;
+constexpr auto eStorageBuffer = vk::DescriptorType::eStorageBuffer;
+constexpr auto eUniformTexelBuffer = vk::DescriptorType::eUniformTexelBuffer;
+constexpr auto eCombinedImageSampler = vk::DescriptorType::eCombinedImageSampler;
+constexpr auto eStorageImage = vk::DescriptorType::eStorageImage;
+
 constexpr VideoCommon::Shader::CompilerSettings compiler_settings{
    VideoCommon::Shader::CompileDepth::FullDecompile};

@@ -119,23 +126,32 @@ ShaderType GetShaderType(Maxwell::ShaderProgram program) {
    }
 }

+/// Appends one descriptor set layout binding of descriptor_type per element of container.
+/// Every appended binding takes the current value of binding, which is then incremented.
+/// Combined image samplers use the element's Size() as descriptor count; all other types use 1.
+template <vk::DescriptorType descriptor_type, class Container>
+void AddBindings(std::vector<vk::DescriptorSetLayoutBinding>& bindings, u32& binding,
+                 vk::ShaderStageFlags stage_flags, const Container& container) {
+    const u32 num_entries = static_cast<u32>(std::size(container));
+    for (std::size_t index = 0; index < num_entries; ++index) {
+        u32 descriptor_count = 1;
+        if constexpr (descriptor_type == eCombinedImageSampler) {
+            // Combined image samplers can be arrayed.
+            descriptor_count = container[index].Size();
+        }
+        bindings.emplace_back(binding, descriptor_type, descriptor_count, stage_flags, nullptr);
+        ++binding;
+    }
+}
+
 u32 FillDescriptorLayout(const ShaderEntries& entries,
                         std::vector<vk::DescriptorSetLayoutBinding>& bindings,
                         Maxwell::ShaderProgram program_type, u32 base_binding) {
+    // Appends the layout bindings for every resource in entries to bindings, numbering them
+    // consecutively from base_binding, and returns the first binding index left unused.
    const ShaderType stage = GetStageFromProgram(program_type);
-    const vk::ShaderStageFlags stage_flags = MaxwellToVK::ShaderStage(stage);
+    const vk::ShaderStageFlags flags = MaxwellToVK::ShaderStage(stage);

    u32 binding = base_binding;
-    const auto AddBindings = [&](vk::DescriptorType descriptor_type, std::size_t num_entries) {
-        for (std::size_t i = 0; i < num_entries; ++i) {
-            bindings.emplace_back(binding++, descriptor_type, 1, stage_flags, nullptr);
-        }
-    };
-    AddBindings(vk::DescriptorType::eUniformBuffer, entries.const_buffers.size());
-    AddBindings(vk::DescriptorType::eStorageBuffer, entries.global_buffers.size());
-    AddBindings(vk::DescriptorType::eUniformTexelBuffer, entries.texel_buffers.size());
-    AddBindings(vk::DescriptorType::eCombinedImageSampler, entries.samplers.size());
-    AddBindings(vk::DescriptorType::eStorageImage, entries.images.size());
+    // The call order fixes the binding numbering: const buffers, global buffers, texel buffers,
+    // samplers, then images.
+    AddBindings<eUniformBuffer>(bindings, binding, flags, entries.const_buffers);
+    AddBindings<eStorageBuffer>(bindings, binding, flags, entries.global_buffers);
+    AddBindings<eUniformTexelBuffer>(bindings, binding, flags, entries.texel_buffers);
+    AddBindings<eCombinedImageSampler>(bindings, binding, flags, entries.samplers);
+    AddBindings<eStorageImage>(bindings, binding, flags, entries.images);
    return binding;
 }

@@ -361,32 +377,45 @@ VKPipelineCache::DecompileShaders(const GraphicsPipelineCacheKey& key) {
    return {std::move(program), std::move(bindings)};
 }

-void FillDescriptorUpdateTemplateEntries(
-    const VKDevice& device, const ShaderEntries& entries, u32& binding, u32& offset,
-    std::vector<vk::DescriptorUpdateTemplateEntry>& template_entries) {
-    static constexpr auto entry_size = static_cast<u32>(sizeof(DescriptorUpdateEntry));
-    const auto AddEntry = [&](vk::DescriptorType descriptor_type, std::size_t count_) {
-        const u32 count = static_cast<u32>(count_);
-        if (descriptor_type == vk::DescriptorType::eUniformTexelBuffer &&
-            device.GetDriverID() == vk::DriverIdKHR::eNvidiaProprietary) {
-            // Nvidia has a bug where updating multiple uniform texels at once causes the driver to
-            // crash.
-            for (u32 i = 0; i < count; ++i) {
-                template_entries.emplace_back(binding + i, 0, 1, descriptor_type,
-                                              offset + i * entry_size, entry_size);
-            }
-        } else if (count != 0) {
-            template_entries.emplace_back(binding, 0, count, descriptor_type, offset, entry_size);
-        }
-        offset += count * entry_size;
-        binding += count;
-    };
+/// Appends the descriptor update template entries for one kind of shader resource.
+/// binding and offset are running counters: binding advances by one per element of container,
+/// and offset advances by sizeof(DescriptorUpdateEntry) per descriptor written.
+template <vk::DescriptorType descriptor_type, class Container>
+void AddEntry(std::vector<vk::DescriptorUpdateTemplateEntry>& template_entries, u32& binding,
+              u32& offset, const Container& container) {
+    static constexpr u32 entry_size = static_cast<u32>(sizeof(DescriptorUpdateEntry));
+    const u32 count = static_cast<u32>(std::size(container));

-    AddEntry(vk::DescriptorType::eUniformBuffer, entries.const_buffers.size());
-    AddEntry(vk::DescriptorType::eStorageBuffer, entries.global_buffers.size());
-    AddEntry(vk::DescriptorType::eUniformTexelBuffer, entries.texel_buffers.size());
-    AddEntry(vk::DescriptorType::eCombinedImageSampler, entries.samplers.size());
-    AddEntry(vk::DescriptorType::eStorageImage, entries.images.size());
+    // Samplers get one template entry per element; each entry covers Size() descriptors
+    // of the same binding.
+    if constexpr (descriptor_type == eCombinedImageSampler) {
+        for (u32 i = 0; i < count; ++i) {
+            const u32 num_samplers = container[i].Size();
+            template_entries.emplace_back(binding, 0, num_samplers, descriptor_type, offset,
+                                          entry_size);
+            ++binding;
+            offset += num_samplers * entry_size;
+        }
+        return;
+    }
+
+    if constexpr (descriptor_type == eUniformTexelBuffer) {
+        // Nvidia has a bug where updating multiple uniform texels at once causes the driver to
+        // crash.
+        // The one-entry-per-texel-buffer form is used regardless of which driver is running.
+        for (u32 i = 0; i < count; ++i) {
+            template_entries.emplace_back(binding + i, 0, 1, descriptor_type,
+                                          offset + i * entry_size, entry_size);
+        }
+    } else if (count > 0) {
+        // All other types: a single entry covering `count` consecutive bindings.
+        template_entries.emplace_back(binding, 0, count, descriptor_type, offset, entry_size);
+    }
+    offset += count * entry_size;
+    binding += count;
+}
+
+/// Appends the descriptor update template entries for every resource kind in entries.
+/// Resources are visited in the order const buffers, global buffers, texel buffers, samplers,
+/// images. binding and offset are running counters that are advanced past the appended entries
+/// so the next shader stage can continue from them.
+void FillDescriptorUpdateTemplateEntries(
+    const ShaderEntries& entries, u32& binding, u32& offset,
+    std::vector<vk::DescriptorUpdateTemplateEntry>& template_entries) {
+    // AddEntry expects (template_entries, binding, offset, container); both counters are u32&,
+    // so passing them in the wrong order compiles silently but corrupts bindings and offsets.
+    AddEntry<eUniformBuffer>(template_entries, binding, offset, entries.const_buffers);
+    AddEntry<eStorageBuffer>(template_entries, binding, offset, entries.global_buffers);
+    AddEntry<eUniformTexelBuffer>(template_entries, binding, offset, entries.texel_buffers);
+    AddEntry<eCombinedImageSampler>(template_entries, binding, offset, entries.samplers);
+    AddEntry<eStorageImage>(template_entries, binding, offset, entries.images);
 }

 } // namespace Vulkan
@@ -194,7 +194,7 @@ private:
 };

 void FillDescriptorUpdateTemplateEntries(
-    const VKDevice& device, const ShaderEntries& entries, u32& binding, u32& offset,
+    const ShaderEntries& entries, u32& binding, u32& offset,
    std::vector<vk::DescriptorUpdateTemplateEntry>& template_entries);

 } // namespace Vulkan
@@ -0,0 +1,122 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
+#include <utility>
+#include <vector>
+
+#include "video_core/renderer_vulkan/declarations.h"
+#include "video_core/renderer_vulkan/vk_device.h"
+#include "video_core/renderer_vulkan/vk_query_cache.h"
+#include "video_core/renderer_vulkan/vk_resource_manager.h"
+#include "video_core/renderer_vulkan/vk_scheduler.h"
+
+namespace Vulkan {
+
+namespace {
+
+/// Vulkan query type for each VideoCore::QueryType, indexed by the enum's numeric value.
+constexpr std::array QUERY_TARGETS = {vk::QueryType::eOcclusion};
+
+/// Looks up the Vulkan query type that corresponds to type in QUERY_TARGETS.
+constexpr vk::QueryType GetTarget(VideoCore::QueryType type) {
+    const auto target_index = static_cast<std::size_t>(type);
+    return QUERY_TARGETS[target_index];
+}
+
+} // Anonymous namespace
+
+/// Constructs an empty pool, passing GROW_STEP to the VKFencedPool base.
+/// NOTE(review): presumably the base grows its resource range by that many slots at a time.
+QueryPool::QueryPool() : VKFencedPool{GROW_STEP} {}
+
+QueryPool::~QueryPool() = default;
+
+/// Stores the device and the query type this pool uses when it later creates Vulkan query pools.
+/// Only the address of device_ is kept, so the device has to outlive the pool.
+void QueryPool::Initialize(const VKDevice& device_, VideoCore::QueryType type_) {
+    type = type_;
+    device = &device_;
+}
+
+/// Commits a free query slot and marks it as in use.
+/// @return The Vulkan query pool that holds the slot and the slot's index inside that pool.
+std::pair<vk::QueryPool, std::uint32_t> QueryPool::Commit(VKFence& fence) {
+    // Keep committing resources until one comes back that is not still flagged as used.
+    std::size_t index = CommitResource(fence);
+    while (usage[index]) {
+        index = CommitResource(fence);
+    }
+    usage[index] = true;
+
+    // Each Vulkan pool holds GROW_STEP queries, so split the flat index into (pool, slot).
+    const std::size_t pool_index = index / GROW_STEP;
+    const auto query_index = static_cast<std::uint32_t>(index % GROW_STEP);
+    return {*pools[pool_index], query_index};
+}
+
+/// Grows the pool to cover the slots [begin, end): extends the usage flags up to end and
+/// creates one new Vulkan query pool holding end - begin queries of this pool's type.
+void QueryPool::Allocate(std::size_t begin, std::size_t end) {
+    usage.resize(end);
+
+    const auto logical = device->GetLogical();
+    const u32 query_count = static_cast<u32>(end - begin);
+    const vk::QueryPoolCreateInfo create_info({}, GetTarget(type), query_count, {});
+    pools.push_back(
+        logical.createQueryPoolUnique(create_info, nullptr, device->GetDispatchLoader()));
+}
+
+/// Clears the in-use flag of a query slot previously handed out by Commit.
+/// @param query The (Vulkan query pool, slot index) pair that identifies the slot.
+void QueryPool::Reserve(std::pair<vk::QueryPool, std::uint32_t> query) {
+    const auto first = std::begin(pools);
+    const auto last = std::end(pools);
+    const auto match =
+        std::find_if(first, last, [wanted = query.first](auto& pool) { return wanted == *pool; });
+    ASSERT(match != last);
+
+    // Rebuild the flat usage index from the pool's position and the slot inside that pool.
+    const std::ptrdiff_t pool_index = std::distance(first, match);
+    const auto slot = pool_index * GROW_STEP + static_cast<std::ptrdiff_t>(query.second);
+    usage[slot] = false;
+}
+
+/// Builds the query cache and initializes one query pool per VideoCore query type.
+VKQueryCache::VKQueryCache(Core::System& system, VideoCore::RasterizerInterface& rasterizer,
+                           const VKDevice& device, VKScheduler& scheduler)
+    : VideoCommon::QueryCacheBase<VKQueryCache, CachedQuery, CounterStream, HostCounter,
+                                  QueryPool>{system, rasterizer},
+      device{device}, scheduler{scheduler} {
+    const auto num_types = static_cast<std::size_t>(VideoCore::NumQueryTypes);
+    for (std::size_t index = 0; index < num_types; ++index) {
+        const auto query_type = static_cast<VideoCore::QueryType>(index);
+        query_pools[index].Initialize(device, query_type);
+    }
+}
+
+VKQueryCache::~VKQueryCache() = default;
+
+/// Commits a query slot from the pool of the given type, tied to the scheduler's current fence.
+std::pair<vk::QueryPool, std::uint32_t> VKQueryCache::AllocateQuery(VideoCore::QueryType type) {
+    auto& pool = query_pools[static_cast<std::size_t>(type)];
+    return pool.Commit(scheduler.GetFence());
+}
+
+/// Hands a query slot back to the pool of the given type so its in-use flag is cleared.
+void VKQueryCache::Reserve(VideoCore::QueryType type,
+                           std::pair<vk::QueryPool, std::uint32_t> query) {
+    auto& pool = query_pools[static_cast<std::size_t>(type)];
+    pool.Reserve(query);
+}
+
+/// Allocates a query slot from the cache and records the commands that start the query.
+/// The scheduler's tick count at construction is saved in ticks.
+HostCounter::HostCounter(VKQueryCache& cache, std::shared_ptr<HostCounter> dependency,
+                         VideoCore::QueryType type)
+    : VideoCommon::HostCounterBase<VKQueryCache, HostCounter>{std::move(dependency)}, cache{cache},
+      type{type}, query{cache.AllocateQuery(type)}, ticks{cache.Scheduler().Ticks()} {
+    const auto dev = cache.Device().GetLogical();
+    // The lambda copies the device handle and the query pair, so it does not refer to this object.
+    cache.Scheduler().Record([dev, query = query](vk::CommandBuffer cmdbuf, auto& dld) {
+        // The slot is reset from the host, then a precise query is begun on the command buffer.
+        dev.resetQueryPoolEXT(query.first, query.second, 1, dld);
+        cmdbuf.beginQuery(query.first, query.second, vk::QueryControlFlagBits::ePrecise, dld);
+    });
+}
+
+/// Returns this counter's query slot to the cache so its in-use flag is cleared.
+HostCounter::~HostCounter() {
+    cache.Reserve(type, query);
+}
+
+/// Records the command that ends this counter's query on the scheduler's command buffer.
+void HostCounter::EndQuery() {
+    // The query pair is captured by value, so the lambda does not refer to this object.
+    cache.Scheduler().Record([query = query](auto cmdbuf, auto& dld) {
+        cmdbuf.endQuery(query.first, query.second, dld);
+    });
+}
+
+/// Waits for this counter's query to finish and returns its 64-bit result.
+/// @return The value written by the device, or 0 if the read did not write a result.
+u64 HostCounter::BlockingQuery() const {
+    // If the scheduler has not advanced past the tick saved at construction, flush it first.
+    // NOTE(review): presumably the query commands are still unsubmitted in that case - confirm.
+    if (ticks >= cache.Scheduler().Ticks()) {
+        cache.Scheduler().Flush();
+    }
+
+    const auto dev = cache.Device().GetLogical();
+    const auto& dld = cache.Device().GetDispatchLoader();
+    // Zero-initialized so a failed read can never return an indeterminate value.
+    u64 value = 0;
+    dev.getQueryPoolResults(query.first, query.second, 1, sizeof(value), &value, sizeof(value),
+                            vk::QueryResultFlagBits::e64 | vk::QueryResultFlagBits::eWait, dld);
+    return value;
+}
+
+} // namespace Vulkan
@@ -0,0 +1,104 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <utility>
+#include <vector>
+
+#include "common/common_types.h"
+#include "video_core/query_cache.h"
+#include "video_core/renderer_vulkan/declarations.h"
+#include "video_core/renderer_vulkan/vk_resource_manager.h"
+
+namespace VideoCore {
+class RasterizerInterface;
+}
+
+namespace Vulkan {
+
+class CachedQuery;
+class HostCounter;
+class VKDevice;
+class VKQueryCache;
+class VKScheduler;
+
+using CounterStream = VideoCommon::CounterStreamBase<VKQueryCache, HostCounter>;
+
+/// Fenced pool of Vulkan queries of a single type, stored in query pools of GROW_STEP queries.
+class QueryPool final : public VKFencedPool {
+public:
+    explicit QueryPool();
+    ~QueryPool() override;
+
+    /// Sets the device and query type used when Vulkan query pools are created.
+    void Initialize(const VKDevice& device, VideoCore::QueryType type);
+
+    /// Commits an unused query slot and returns its Vulkan pool and index within that pool.
+    std::pair<vk::QueryPool, std::uint32_t> Commit(VKFence& fence);
+
+    /// Marks a slot returned by Commit as no longer in use.
+    void Reserve(std::pair<vk::QueryPool, std::uint32_t> query);
+
+protected:
+    /// Creates a new Vulkan query pool covering the slots [begin, end).
+    void Allocate(std::size_t begin, std::size_t end) override;
+
+private:
+    /// Number of queries in each Vulkan query pool.
+    static constexpr std::size_t GROW_STEP = 512;
+
+    const VKDevice* device = nullptr; ///< Set by Initialize; not owned.
+    VideoCore::QueryType type = {};   ///< Query type of every pool created here.
+
+    std::vector<UniqueQueryPool> pools; ///< One entry per Allocate call.
+    std::vector<bool> usage;            ///< In-use flag per query slot, indexed flat.
+};
+
+/// Vulkan implementation of the common query cache; hands out query slots per query type.
+class VKQueryCache final
+    : public VideoCommon::QueryCacheBase<VKQueryCache, CachedQuery, CounterStream, HostCounter,
+                                         QueryPool> {
+public:
+    explicit VKQueryCache(Core::System& system, VideoCore::RasterizerInterface& rasterizer,
+                          const VKDevice& device, VKScheduler& scheduler);
+    ~VKQueryCache();
+
+    /// Commits a query slot of the given type using the scheduler's current fence.
+    std::pair<vk::QueryPool, std::uint32_t> AllocateQuery(VideoCore::QueryType type);
+
+    /// Returns a query slot of the given type to its pool.
+    void Reserve(VideoCore::QueryType type, std::pair<vk::QueryPool, std::uint32_t> query);
+
+    /// Returns the device this cache was constructed with.
+    const VKDevice& Device() const noexcept {
+        return device;
+    }
+
+    /// Returns the scheduler this cache was constructed with.
+    VKScheduler& Scheduler() const noexcept {
+        return scheduler;
+    }
+
+private:
+    const VKDevice& device;
+    VKScheduler& scheduler;
+};
+
+/// A single host-side query: it owns one query slot from construction to destruction.
+class HostCounter final : public VideoCommon::HostCounterBase<VKQueryCache, HostCounter> {
+public:
+    /// Allocates a slot from cache and records the commands that begin the query.
+    explicit HostCounter(VKQueryCache& cache, std::shared_ptr<HostCounter> dependency,
+                         VideoCore::QueryType type);
+    /// Returns the slot to the cache.
+    ~HostCounter();
+
+    /// Records the command that ends the query.
+    void EndQuery();
+
+private:
+    /// Waits for the query result and returns it.
+    u64 BlockingQuery() const override;
+
+    VKQueryCache& cache;
+    const VideoCore::QueryType type;
+    const std::pair<vk::QueryPool, std::uint32_t> query; ///< Vulkan pool and slot index.
+    const u64 ticks; ///< Scheduler tick count read at construction.
+};
+
+/// Cached query entry for the Vulkan backend; it adds nothing to the common base class.
+class CachedQuery : public VideoCommon::CachedQueryBase<HostCounter> {
+public:
+    /// The cache and query type arguments are accepted but unused; only the addresses are
+    /// forwarded to the base class.
+    explicit CachedQuery(VKQueryCache&, VideoCore::QueryType, VAddr cpu_addr, u8* host_ptr)
+        : VideoCommon::CachedQueryBase<HostCounter>{cpu_addr, host_ptr} {}
+};
+
+} // namespace Vulkan
@@ -105,17 +105,20 @@ void TransitionImages(const std::vector<ImageView>& views, vk::PipelineStageFlag

 template <typename Engine, typename Entry>
 Tegra::Texture::FullTextureInfo GetTextureInfo(const Engine& engine, const Entry& entry,
-                                               std::size_t stage) {
+                                               std::size_t stage, std::size_t index = 0) {
+    // Resolves the texture information for entry in the given shader stage. index selects an
+    // element of an arrayed entry; it is not applied to bindless entries.
    const auto stage_type = static_cast<Tegra::Engines::ShaderType>(stage);
    if (entry.IsBindless()) {
        const Tegra::Texture::TextureHandle tex_handle =
            engine.AccessConstBuffer32(stage_type, entry.GetBuffer(), entry.GetOffset());
        return engine.GetTextureInfo(tex_handle);
    }
+    // Advance the entry's base offset by index texture handles; the handle size comes from the
+    // guest driver profile.
+    const auto& gpu_profile = engine.AccessGuestDriverProfile();
+    const u32 entry_offset = static_cast<u32>(index * gpu_profile.GetTextureHandlerSize());
+    const u32 offset = entry.GetOffset() + entry_offset;
    if constexpr (std::is_same_v<Engine, Tegra::Engines::Maxwell3D>) {
-        return engine.GetStageTexture(stage_type, entry.GetOffset());
+        return engine.GetStageTexture(stage_type, offset);
    } else {
-        return engine.GetTexture(entry.GetOffset());
+        return engine.GetTexture(offset);
    }
 }

@@ -289,25 +292,19 @@ RasterizerVulkan::RasterizerVulkan(Core::System& system, Core::Frontend::EmuWind
                    staging_pool),
      pipeline_cache(system, *this, device, scheduler, descriptor_pool, update_descriptor_queue),
      buffer_cache(*this, system, device, memory_manager, scheduler, staging_pool),
-      sampler_cache(device) {}
+      sampler_cache(device), query_cache(system, *this, device, scheduler) {
+    scheduler.SetQueryCache(query_cache);
+}

 RasterizerVulkan::~RasterizerVulkan() = default;

-bool RasterizerVulkan::DrawBatch(bool is_indexed) {
-    Draw(is_indexed, false);
-    return true;
-}
-
-bool RasterizerVulkan::DrawMultiBatch(bool is_indexed) {
-    Draw(is_indexed, true);
-    return true;
-}
-
 void RasterizerVulkan::Draw(bool is_indexed, bool is_instanced) {
    MICROPROFILE_SCOPE(Vulkan_Drawing);

    FlushWork();

+    query_cache.UpdateCounters();
+
    const auto& gpu = system.GPU().Maxwell3D();
    GraphicsPipelineCacheKey key{GetFixedPipelineState(gpu.regs)};

@@ -362,6 +359,8 @@ void RasterizerVulkan::Draw(bool is_indexed, bool is_instanced) {
 void RasterizerVulkan::Clear() {
    MICROPROFILE_SCOPE(Vulkan_Clearing);

+    query_cache.UpdateCounters();
+
    const auto& gpu = system.GPU().Maxwell3D();
    if (!system.GPU().Maxwell3D().ShouldExecute()) {
        return;
@@ -429,6 +428,8 @@ void RasterizerVulkan::DispatchCompute(GPUVAddr code_addr) {
    sampled_views.clear();
    image_views.clear();

+    query_cache.UpdateCounters();
+
    const auto& launch_desc = system.GPU().KeplerCompute().launch_description;
    const ComputePipelineCacheKey key{
        code_addr,
@@ -471,17 +472,28 @@ void RasterizerVulkan::DispatchCompute(GPUVAddr code_addr) {
    });
 }

+/// Forwards the counter reset for the given query type to the query cache.
+void RasterizerVulkan::ResetCounter(VideoCore::QueryType type) {
+    query_cache.ResetCounter(type);
+}
+
+/// Forwards a query request (GPU address, query type, optional timestamp) to the query cache.
+void RasterizerVulkan::Query(GPUVAddr gpu_addr, VideoCore::QueryType type,
+                             std::optional<u64> timestamp) {
+    query_cache.Query(gpu_addr, type, timestamp);
+}
+
 void RasterizerVulkan::FlushAll() {}

 void RasterizerVulkan::FlushRegion(CacheAddr addr, u64 size) {
+    // Forward the flush of [addr, addr + size) to the texture, buffer and query caches.
    texture_cache.FlushRegion(addr, size);
    buffer_cache.FlushRegion(addr, size);
+    query_cache.FlushRegion(addr, size);
 }

 void RasterizerVulkan::InvalidateRegion(CacheAddr addr, u64 size) {
+    // Forward the invalidation of [addr, addr + size) to the texture, pipeline, buffer and
+    // query caches.
    texture_cache.InvalidateRegion(addr, size);
    pipeline_cache.InvalidateRegion(addr, size);
    buffer_cache.InvalidateRegion(addr, size);
+    query_cache.InvalidateRegion(addr, size);
 }

 void RasterizerVulkan::FlushAndInvalidateRegion(CacheAddr addr, u64 size) {
@@ -602,33 +614,34 @@ bool RasterizerVulkan::WalkAttachmentOverlaps(const CachedSurfaceView& attachmen
 std::tuple<vk::Framebuffer, vk::Extent2D> RasterizerVulkan::ConfigureFramebuffers(
    vk::RenderPass renderpass) {
    FramebufferCacheKey key{renderpass, std::numeric_limits<u32>::max(),
-                            std::numeric_limits<u32>::max()};
+                            std::numeric_limits<u32>::max(), std::numeric_limits<u32>::max()};

-    const auto MarkAsModifiedAndPush = [&](const View& view) {
-        if (view == nullptr) {
+    const auto try_push = [&](const View& view) {
+        if (!view) {
            return false;
        }
        key.views.push_back(view->GetHandle());
        key.width = std::min(key.width, view->GetWidth());
        key.height = std::min(key.height, view->GetHeight());
+        key.layers = std::min(key.layers, view->GetNumLayers());
        return true;
    };

    for (std::size_t index = 0; index < std::size(color_attachments); ++index) {
-        if (MarkAsModifiedAndPush(color_attachments[index])) {
+        if (try_push(color_attachments[index])) {
            texture_cache.MarkColorBufferInUse(index);
        }
    }
-    if (MarkAsModifiedAndPush(zeta_attachment)) {
+    if (try_push(zeta_attachment)) {
        texture_cache.MarkDepthBufferInUse();
    }

    const auto [fbentry, is_cache_miss] = framebuffer_cache.try_emplace(key);
    auto& framebuffer = fbentry->second;
    if (is_cache_miss) {
-        const vk::FramebufferCreateInfo framebuffer_ci({}, key.renderpass,
-                                                       static_cast<u32>(key.views.size()),
-                                                       key.views.data(), key.width, key.height, 1);
+        const vk::FramebufferCreateInfo framebuffer_ci(
+            {}, key.renderpass, static_cast<u32>(key.views.size()), key.views.data(), key.width,
+            key.height, key.layers);
        const auto dev = device.GetLogical();
        const auto& dld = device.GetDispatchLoader();
        framebuffer = dev.createFramebufferUnique(framebuffer_ci, nullptr, dld);
@@ -826,14 +839,16 @@ void RasterizerVulkan::SetupGraphicsTextures(const ShaderEntries& entries, std::
    MICROPROFILE_SCOPE(Vulkan_Textures);
    const auto& gpu = system.GPU().Maxwell3D();
    for (const auto& entry : entries.samplers) {
-        const auto texture = GetTextureInfo(gpu, entry, stage);
-        SetupTexture(texture, entry);
+        for (std::size_t i = 0; i < entry.Size(); ++i) {
+            const auto texture = GetTextureInfo(gpu, entry, stage, i);
+            SetupTexture(texture, entry);
+        }
    }
 }

 void RasterizerVulkan::SetupGraphicsImages(const ShaderEntries& entries, std::size_t stage) {
    MICROPROFILE_SCOPE(Vulkan_Images);
-    const auto& gpu = system.GPU().KeplerCompute();
+    const auto& gpu = system.GPU().Maxwell3D();
    for (const auto& entry : entries.images) {
        const auto tic = GetTextureInfo(gpu, entry, stage).tic;
        SetupImage(tic, entry);
@@ -876,8 +891,10 @@ void RasterizerVulkan::SetupComputeTextures(const ShaderEntries& entries) {
    MICROPROFILE_SCOPE(Vulkan_Textures);
    const auto& gpu = system.GPU().KeplerCompute();
    for (const auto& entry : entries.samplers) {
-        const auto texture = GetTextureInfo(gpu, entry, ComputeShaderIndex);
-        SetupTexture(texture, entry);
+        for (std::size_t i = 0; i < entry.Size(); ++i) {
+            const auto texture = GetTextureInfo(gpu, entry, ComputeShaderIndex, i);
+            SetupTexture(texture, entry);
+        }
    }
 }

@@ -24,6 +24,7 @@
 #include "video_core/renderer_vulkan/vk_descriptor_pool.h"
 #include "video_core/renderer_vulkan/vk_memory_manager.h"
 #include "video_core/renderer_vulkan/vk_pipeline_cache.h"
+#include "video_core/renderer_vulkan/vk_query_cache.h"
 #include "video_core/renderer_vulkan/vk_renderpass_cache.h"
 #include "video_core/renderer_vulkan/vk_resource_manager.h"
 #include "video_core/renderer_vulkan/vk_sampler_cache.h"
@@ -55,6 +56,7 @@ struct FramebufferCacheKey {
    vk::RenderPass renderpass{};
    u32 width = 0;
    u32 height = 0;
+    u32 layers = 0;
    ImageViewsPack views;

    std::size_t Hash() const noexcept {
@@ -65,12 +67,17 @@ struct FramebufferCacheKey {
        }
        boost::hash_combine(hash, width);
        boost::hash_combine(hash, height);
+        boost::hash_combine(hash, layers);
        return hash;
    }

    bool operator==(const FramebufferCacheKey& rhs) const noexcept {
-        return std::tie(renderpass, views, width, height) ==
-               std::tie(rhs.renderpass, rhs.views, rhs.width, rhs.height);
+        return std::tie(renderpass, views, width, height, layers) ==
+               std::tie(rhs.renderpass, rhs.views, rhs.width, rhs.height, rhs.layers); // layers is compared too, matching its use in Hash()
+    }
+
+    bool operator!=(const FramebufferCacheKey& rhs) const noexcept {
+        return !operator==(rhs); // Defined as the exact negation of operator==.
    }
 };

@@ -96,7 +103,7 @@ struct ImageView {
    vk::ImageLayout* layout = nullptr;
 };

-class RasterizerVulkan : public VideoCore::RasterizerAccelerated {
+class RasterizerVulkan final : public VideoCore::RasterizerAccelerated {
 public:
    explicit RasterizerVulkan(Core::System& system, Core::Frontend::EmuWindow& render_window,
                              VKScreenInfo& screen_info, const VKDevice& device,
@@ -104,10 +111,11 @@ public:
                              VKScheduler& scheduler);
    ~RasterizerVulkan() override;

-    bool DrawBatch(bool is_indexed) override;
-    bool DrawMultiBatch(bool is_indexed) override;
+    void Draw(bool is_indexed, bool is_instanced) override;
    void Clear() override;
    void DispatchCompute(GPUVAddr code_addr) override;
+    void ResetCounter(VideoCore::QueryType type) override;
+    void Query(GPUVAddr gpu_addr, VideoCore::QueryType type, std::optional<u64> timestamp) override;
    void FlushAll() override;
    void FlushRegion(CacheAddr addr, u64 size) override;
    void InvalidateRegion(CacheAddr addr, u64 size) override;
@@ -140,8 +148,6 @@ private:

    static constexpr std::size_t ZETA_TEXCEPTION_INDEX = 8;

-    void Draw(bool is_indexed, bool is_instanced);
-
    void FlushWork();

    Texceptions UpdateAttachments();
@@ -247,6 +253,7 @@ private:
    VKPipelineCache pipeline_cache;
    VKBufferCache buffer_cache;
    VKSamplerCache sampler_cache;
+    VKQueryCache query_cache;

    std::array<View, Maxwell::NumRenderTargets> color_attachments;
    View zeta_attachment;
@@ -23,7 +23,14 @@ static std::optional<vk::BorderColor> TryConvertBorderColor(std::array<float, 4>
    } else if (color == std::array<float, 4>{1, 1, 1, 1}) {
        return vk::BorderColor::eFloatOpaqueWhite;
    } else {
-        return {};
+        if (color[0] + color[1] + color[2] > 1.35f) {
+            // If color elements are brighter than roughly 0.5 average, use white border
+            return vk::BorderColor::eFloatOpaqueWhite;
+        }
+        if (color[3] > 0.5f) {
+            return vk::BorderColor::eFloatOpaqueBlack;
+        }
+        return vk::BorderColor::eFloatTransparentBlack;
    }
 }

@@ -37,8 +44,6 @@ UniqueSampler VKSamplerCache::CreateSampler(const Tegra::Texture::TSCEntry& tsc)

    const auto border_color{tsc.GetBorderColor()};
    const auto vk_border_color{TryConvertBorderColor(border_color)};
-    UNIMPLEMENTED_IF_MSG(!vk_border_color, "Unimplemented border color {} {} {} {}",
-                         border_color[0], border_color[1], border_color[2], border_color[3]);

    constexpr bool unnormalized_coords{false};

@@ -6,6 +6,7 @@
 #include "common/microprofile.h"
 #include "video_core/renderer_vulkan/declarations.h"
 #include "video_core/renderer_vulkan/vk_device.h"
+#include "video_core/renderer_vulkan/vk_query_cache.h"
 #include "video_core/renderer_vulkan/vk_resource_manager.h"
 #include "video_core/renderer_vulkan/vk_scheduler.h"

@@ -139,6 +140,8 @@ void VKScheduler::SubmitExecution(vk::Semaphore semaphore) {
 }

 void VKScheduler::AllocateNewContext() {
+    ++ticks;
+
    std::unique_lock lock{mutex};
    current_fence = next_fence;
    next_fence = &resource_manager.CommitFence();
@@ -146,6 +149,10 @@ void VKScheduler::AllocateNewContext() {
    current_cmdbuf = resource_manager.CommitCommandBuffer(*current_fence);
    current_cmdbuf.begin({vk::CommandBufferUsageFlagBits::eOneTimeSubmit},
                         device.GetDispatchLoader());
+    // Enable counters once again. These are disabled when a command buffer is finished.
+    if (query_cache) {
+        query_cache->UpdateCounters();
+    }
 }

 void VKScheduler::InvalidateState() {
@@ -159,6 +166,7 @@ void VKScheduler::InvalidateState() {
 }

 void VKScheduler::EndPendingOperations() {
+    query_cache->DisableStreams();
    EndRenderPass();
 }

@@ -4,6 +4,7 @@

 #pragma once

+#include <atomic>
 #include <condition_variable>
 #include <memory>
 #include <optional>
@@ -18,6 +19,7 @@ namespace Vulkan {

 class VKDevice;
 class VKFence;
+class VKQueryCache;
 class VKResourceManager;

 class VKFenceView {
@@ -67,6 +69,11 @@ public:
    /// Binds a pipeline to the current execution context.
    void BindGraphicsPipeline(vk::Pipeline pipeline);

+    /// Assigns the query cache; only its address is kept, so it must stay alive while the scheduler uses it.
+    void SetQueryCache(VKQueryCache& query_cache_) {
+        query_cache = &query_cache_;
+    }
+
    /// Returns true when viewports have been set in the current command buffer.
    bool TouchViewports() {
        return std::exchange(state.viewports, true);
@@ -112,6 +119,11 @@ public:
        return current_fence;
    }

+    /// Returns the current command buffer tick; AllocateNewContext() increments it on every call.
+    u64 Ticks() const {
+        return ticks;
+    }
+
 private:
    class Command {
    public:
@@ -205,6 +217,8 @@ private:

    const VKDevice& device;
    VKResourceManager& resource_manager;
+    VKQueryCache* query_cache = nullptr;
+
    vk::CommandBuffer current_cmdbuf;
    VKFence* current_fence = nullptr;
    VKFence* next_fence = nullptr;
@@ -227,6 +241,7 @@ private:
    Common::SPSCQueue<std::unique_ptr<CommandChunk>> chunk_reserve;
    std::mutex mutex;
    std::condition_variable cv;
+    std::atomic<u64> ticks = 0;
    bool quit = false;
 };

@@ -69,8 +69,9 @@ struct TexelBuffer {

 struct SampledImage {
    Id image_type{};
-    Id sampled_image_type{};
-    Id sampler{};
+    Id sampler_type{}; // Sampled image type created from image_type via TypeSampledImage.
+    Id sampler_pointer_type{}; // UniformConstant pointer to sampler_type; result type when indexing a sampler array.
+    Id variable{}; // The declared OpVariable (a single sampler or an array of them when indexed).
 };

 struct StorageImage {
@@ -86,6 +87,7 @@ struct AttributeType {

 struct VertexIndices {
    std::optional<u32> position;
+    std::optional<u32> layer;
    std::optional<u32> viewport;
    std::optional<u32> point_size;
    std::optional<u32> clip_distances;
@@ -275,21 +277,29 @@ public:
        AddCapability(spv::Capability::ImageGatherExtended);
        AddCapability(spv::Capability::SampledBuffer);
        AddCapability(spv::Capability::StorageImageWriteWithoutFormat);
+        AddCapability(spv::Capability::DrawParameters);
        AddCapability(spv::Capability::SubgroupBallotKHR);
        AddCapability(spv::Capability::SubgroupVoteKHR);
        AddExtension("SPV_KHR_shader_ballot");
        AddExtension("SPV_KHR_subgroup_vote");
        AddExtension("SPV_KHR_storage_buffer_storage_class");
        AddExtension("SPV_KHR_variable_pointers");
+        AddExtension("SPV_KHR_shader_draw_parameters");

-        if (ir.UsesViewportIndex()) {
-            AddCapability(spv::Capability::MultiViewport);
-            if (device.IsExtShaderViewportIndexLayerSupported()) {
+        if (ir.UsesLayer() || ir.UsesViewportIndex()) {
+            if (ir.UsesViewportIndex()) {
+                AddCapability(spv::Capability::MultiViewport);
+            }
+            if (stage != ShaderType::Geometry && device.IsExtShaderViewportIndexLayerSupported()) {
                AddExtension("SPV_EXT_shader_viewport_index_layer");
                AddCapability(spv::Capability::ShaderViewportIndexLayerEXT);
            }
        }

+        if (device.IsShaderStorageImageReadWithoutFormatSupported()) {
+            AddCapability(spv::Capability::StorageImageReadWithoutFormat);
+        }
+
        if (device.IsFloat16Supported()) {
            AddCapability(spv::Capability::Float16);
        }
@@ -492,9 +502,11 @@ private:
        interfaces.push_back(AddGlobalVariable(Name(out_vertex, "out_vertex")));

        // Declare input attributes
-        vertex_index = DeclareInputBuiltIn(spv::BuiltIn::VertexIndex, t_in_uint, "vertex_index");
+        vertex_index = DeclareInputBuiltIn(spv::BuiltIn::VertexIndex, t_in_int, "vertex_index");
        instance_index =
-            DeclareInputBuiltIn(spv::BuiltIn::InstanceIndex, t_in_uint, "instance_index");
+            DeclareInputBuiltIn(spv::BuiltIn::InstanceIndex, t_in_int, "instance_index");
+        base_vertex = DeclareInputBuiltIn(spv::BuiltIn::BaseVertex, t_in_int, "base_vertex");
+        base_instance = DeclareInputBuiltIn(spv::BuiltIn::BaseInstance, t_in_int, "base_instance");
    }

    void DeclareTessControl() {
@@ -822,16 +834,20 @@ private:
            constexpr int sampled = 1;
            constexpr auto format = spv::ImageFormat::Unknown;
            const Id image_type = TypeImage(t_float, dim, depth, arrayed, ms, sampled, format);
-            const Id sampled_image_type = TypeSampledImage(image_type);
-            const Id pointer_type =
-                TypePointer(spv::StorageClass::UniformConstant, sampled_image_type);
+            const Id sampler_type = TypeSampledImage(image_type);
+            const Id sampler_pointer_type =
+                TypePointer(spv::StorageClass::UniformConstant, sampler_type);
+            const Id type = sampler.IsIndexed()
+                                ? TypeArray(sampler_type, Constant(t_uint, sampler.Size()))
+                                : sampler_type;
+            const Id pointer_type = TypePointer(spv::StorageClass::UniformConstant, type);
            const Id id = OpVariable(pointer_type, spv::StorageClass::UniformConstant);
            AddGlobalVariable(Name(id, fmt::format("sampler_{}", sampler.GetIndex())));
            Decorate(id, spv::Decoration::Binding, binding++);
            Decorate(id, spv::Decoration::DescriptorSet, DESCRIPTOR_SET);

-            sampled_images.emplace(sampler.GetIndex(),
-                                   SampledImage{image_type, sampled_image_type, id});
+            sampled_images.emplace(sampler.GetIndex(), SampledImage{image_type, sampler_type,
+                                                                    sampler_pointer_type, id});
        }
        return binding;
    }
@@ -920,13 +936,22 @@ private:
        VertexIndices indices;
        indices.position = AddBuiltIn(t_float4, spv::BuiltIn::Position, "position");

+        if (ir.UsesLayer()) {
+            if (stage != ShaderType::Vertex || device.IsExtShaderViewportIndexLayerSupported()) {
+                indices.layer = AddBuiltIn(t_int, spv::BuiltIn::Layer, "layer");
+            } else {
+                LOG_ERROR(
+                    Render_Vulkan,
+                    "Shader requires Layer but it's not supported on this stage with this device.");
+            }
+        }
+
        if (ir.UsesViewportIndex()) {
            if (stage != ShaderType::Vertex || device.IsExtShaderViewportIndexLayerSupported()) {
                indices.viewport = AddBuiltIn(t_int, spv::BuiltIn::ViewportIndex, "viewport_index");
            } else {
-                LOG_ERROR(Render_Vulkan,
-                          "Shader requires ViewportIndex but it's not supported on this "
-                          "stage with this device.");
+                LOG_ERROR(Render_Vulkan, "Shader requires ViewportIndex but it's not supported on "
+                                         "this stage with this device.");
            }
        }

@@ -1068,9 +1093,12 @@ private:
                    return {OpLoad(t_float, AccessElement(t_in_float, tess_coord, element)),
                            Type::Float};
                case 2:
-                    return {OpLoad(t_uint, instance_index), Type::Uint};
+                    return {
+                        OpISub(t_int, OpLoad(t_int, instance_index), OpLoad(t_int, base_instance)),
+                        Type::Int};
                case 3:
-                    return {OpLoad(t_uint, vertex_index), Type::Uint};
+                    return {OpISub(t_int, OpLoad(t_int, vertex_index), OpLoad(t_int, base_vertex)),
+                            Type::Int};
                }
                UNIMPLEMENTED_MSG("Unmanaged TessCoordInstanceIDVertexID element={}", element);
                return {Constant(t_uint, 0U), Type::Uint};
@@ -1285,6 +1313,13 @@ private:
                }
                case Attribute::Index::LayerViewportPointSize:
                    switch (element) {
+                    case 1: {
+                        if (!out_indices.layer) {
+                            return {};
+                        }
+                        const u32 index = out_indices.layer.value();
+                        return {AccessElement(t_out_int, out_vertex, index), Type::Int};
+                    }
                    case 2: {
                        if (!out_indices.viewport) {
                            return {};
@@ -1355,6 +1390,11 @@ private:
            UNIMPLEMENTED();
        }

+        if (!target.id) {
+            // On failure we return a nullptr target.id, skip these stores.
+            return {};
+        }
+
        OpStore(target.id, As(Visit(src), target.type));
        return {};
    }
@@ -1490,7 +1530,12 @@ private:
        ASSERT(!meta.sampler.IsBuffer());

        const auto& entry = sampled_images.at(meta.sampler.GetIndex());
-        return OpLoad(entry.sampled_image_type, entry.sampler);
+        Id sampler = entry.variable;
+        if (meta.sampler.IsIndexed()) {
+            const Id index = AsInt(Visit(meta.index));
+            sampler = OpAccessChain(entry.sampler_pointer_type, sampler, index);
+        }
+        return OpLoad(entry.sampler_type, sampler);
    }

    Id GetTextureImage(Operation operation) {
@@ -1748,8 +1793,16 @@ private:
    }

    Expression ImageLoad(Operation operation) {
-        UNIMPLEMENTED();
-        return {};
+        if (!device.IsShaderStorageImageReadWithoutFormatSupported()) { // Capability is only declared when supported.
+            return {v_float_zero, Type::Float}; // Fallback: yield a constant zero instead of reading the image.
+        }
+
+        const auto& meta{std::get<MetaImage>(operation.GetMeta())};
+
+        const Id coords = GetCoordinates(operation, Type::Int);
+        const Id texel = OpImageRead(t_uint4, GetImage(operation), coords); // Reads a full four-component uint texel.
+
+        return {OpCompositeExtract(t_uint, texel, meta.element), Type::Uint}; // Only component meta.element is returned.
    }

    Expression ImageStore(Operation operation) {
@@ -2168,16 +2221,14 @@ private:
        switch (specialization.attribute_types.at(location)) {
        case Maxwell::VertexAttribute::Type::SignedNorm:
        case Maxwell::VertexAttribute::Type::UnsignedNorm:
+        case Maxwell::VertexAttribute::Type::UnsignedScaled:
+        case Maxwell::VertexAttribute::Type::SignedScaled:
        case Maxwell::VertexAttribute::Type::Float:
            return {Type::Float, t_in_float, t_in_float4};
        case Maxwell::VertexAttribute::Type::SignedInt:
            return {Type::Int, t_in_int, t_in_int4};
        case Maxwell::VertexAttribute::Type::UnsignedInt:
            return {Type::Uint, t_in_uint, t_in_uint4};
-        case Maxwell::VertexAttribute::Type::UnsignedScaled:
-        case Maxwell::VertexAttribute::Type::SignedScaled:
-            UNIMPLEMENTED();
-            return {Type::Float, t_in_float, t_in_float4};
        default:
            UNREACHABLE();
            return {Type::Float, t_in_float, t_in_float4};
@@ -2542,6 +2593,8 @@ private:

    Id instance_index{};
    Id vertex_index{};
+    Id base_instance{};
+    Id base_vertex{};
    std::array<Id, Maxwell::NumRenderTargets> frag_colors{};
    Id frag_depth{};
    Id frag_coord{};
@@ -141,11 +141,6 @@ void VKSwapchain::CreateSwapchain(const vk::SurfaceCapabilitiesKHR& capabilities

    const vk::SurfaceFormatKHR surface_format{ChooseSwapSurfaceFormat(formats, srgb)};
    const vk::PresentModeKHR present_mode{ChooseSwapPresentMode(present_modes)};
-    extent = ChooseSwapExtent(capabilities, width, height);
-
-    current_width = extent.width;
-    current_height = extent.height;
-    current_srgb = srgb;

    u32 requested_image_count{capabilities.minImageCount + 1};
    if (capabilities.maxImageCount > 0 && requested_image_count > capabilities.maxImageCount) {
@@ -153,10 +148,9 @@ void VKSwapchain::CreateSwapchain(const vk::SurfaceCapabilitiesKHR& capabilities
    }

    vk::SwapchainCreateInfoKHR swapchain_ci(
-        {}, surface, requested_image_count, surface_format.format, surface_format.colorSpace,
-        extent, 1, vk::ImageUsageFlagBits::eColorAttachment, {}, {}, {},
-        capabilities.currentTransform, vk::CompositeAlphaFlagBitsKHR::eOpaque, present_mode, false,
-        {});
+        {}, surface, requested_image_count, surface_format.format, surface_format.colorSpace, {}, 1,
+        vk::ImageUsageFlagBits::eColorAttachment, {}, {}, {}, capabilities.currentTransform,
+        vk::CompositeAlphaFlagBitsKHR::eOpaque, present_mode, false, {});

    const u32 graphics_family{device.GetGraphicsFamily()};
    const u32 present_family{device.GetPresentFamily()};
@@ -169,9 +163,18 @@ void VKSwapchain::CreateSwapchain(const vk::SurfaceCapabilitiesKHR& capabilities
        swapchain_ci.imageSharingMode = vk::SharingMode::eExclusive;
    }

+    // Request the size again to reduce the possibility of a TOCTOU race condition.
+    const auto updated_capabilities = physical_device.getSurfaceCapabilitiesKHR(surface, dld);
+    swapchain_ci.imageExtent = ChooseSwapExtent(updated_capabilities, width, height);
+    // Don't add code within this and the swapchain creation.
    const auto dev{device.GetLogical()};
    swapchain = dev.createSwapchainKHRUnique(swapchain_ci, nullptr, dld);

+    extent = swapchain_ci.imageExtent;
+    current_width = extent.width;
+    current_height = extent.height;
+    current_srgb = srgb;
+
    images = dev.getSwapchainImagesKHR(*swapchain, dld);
    image_count = static_cast<u32>(images.size());
    image_format = surface_format.format;
@@ -151,6 +151,10 @@ public:
        return params.GetMipHeight(base_level);
    }

+    u32 GetNumLayers() const {
+        return num_layers; // Plain accessor for the view's stored layer count.
+    }
+
    bool IsBufferView() const {
        return buffer_view; // Returns the buffer_view member as a bool.
    }
@@ -53,29 +53,24 @@ u32 ShaderIR::DecodeArithmetic(NodeBlock& bb, u32 pc) {

        op_b = GetOperandAbsNegFloat(op_b, false, instr.fmul.negate_b);

-        // TODO(Rodrigo): Should precise be used when there's a postfactor?
-        Node value = Operation(OperationCode::FMul, PRECISE, op_a, op_b);
+        static constexpr std::array FmulPostFactor = {
+            1.000f, // None
+            0.500f, // Divide 2
+            0.250f, // Divide 4
+            0.125f, // Divide 8
+            8.000f, // Mul 8
+            4.000f, // Mul 4
+            2.000f, // Mul 2
+        };

        if (instr.fmul.postfactor != 0) {
-            auto postfactor = static_cast<s32>(instr.fmul.postfactor);
-
-            // Postfactor encoded as 3-bit 1's complement in instruction, interpreted with below
-            // logic.
-            if (postfactor >= 4) {
-                postfactor = 7 - postfactor;
-            } else {
-                postfactor = 0 - postfactor;
-            }
-
-            if (postfactor > 0) {
-                value = Operation(OperationCode::FMul, NO_PRECISE, value,
-                                  Immediate(static_cast<f32>(1 << postfactor)));
-            } else {
-                value = Operation(OperationCode::FDiv, NO_PRECISE, value,
-                                  Immediate(static_cast<f32>(1 << -postfactor)));
-            }
+            op_a = Operation(OperationCode::FMul, NO_PRECISE, op_a,
+                             Immediate(FmulPostFactor[instr.fmul.postfactor]));
        }

+        // TODO(Rodrigo): Should precise be used when there's a postfactor?
+        Node value = Operation(OperationCode::FMul, PRECISE, op_a, op_b);
+
        value = GetSaturatedFloat(value, instr.alu.saturate_d);

        SetInternalFlagsFromFloat(bb, value, instr.generates_cc);
--- a/Show More
+++ b/Show More