Compare commits

..

2 Commits

Author SHA1 Message Date
c1ed150949 Kernel: Wake cores from idle directly rather than through a host thread (#6837)
* Kernel: Wake cores from idle directly rather than through a host thread

Right now when a core enters an idle state, leaving that idle state requires us to first signal the core's idle thread, which then signals the correct thread that we want to run on the core. This means that in a lot of cases, we're paying double for a thread to be woken from an idle state.

This PR moves this process to happen on the thread that is waking others out of idle, instead of an idle thread that needs to be woken first.

For compatibility the process has been kept as similar as possible - the process for IdleThreadLoop has been migrated to TryLeaveIdle, and is gated by a condition variable that lets it run only once at a time for each core. A core is only considered for wake from idle if idle is both active and has been signalled - the signal is consumed and the active state is cleared when the core leaves idle.

Dummy threads (just the idle thread at the moment) have been changed to have no host thread, as the work is now done by threads entering idle and signalling out of it.

This could put a bit of extra work on threads that would have triggered `_idleInterruptEvent` before, but I'd expect less work than signalling all those reset events and the OS overhead that follows. Worst case is that other threads performing these signals at the same time will have to wait for each other, but it's still going to be a very short amount of time.

Improvements are best seen in games with heavy (or very misguided) multithreading, such as Pokemon: Legends Arceus. Improvements are expected in Scarlet/Violet and TOTK, but are harder to measure.

Testing on Linux/MacOS still to be done, definitely need to test more games as this affects all of them (obviously) and any issues might be rare to encounter.

* Remove _idleThread entirely

* Use spinwait so we don't completely blast the CPU with cmpxchg

* Didn't I already do this

* Cleanup
2024-05-22 17:47:27 -03:00
c634eb4054 Updating Concentus dependency to speed up Opus decoding (#6757)
* Implementing new features in the latest Concentus library - span-in, span-out Opus decoding (so we don't have to make temporary buffer copies), returning a more precise error code from the decoder, and automatically linking the native opus library with P/invoke if supported on the current system

* Remove stub log messages and commit package upgrade to 2.1.0

* use more correct disposal pattern

* Bump to Concentus 2.1.1

* Bump to Concentus 2.1.2

* Don't bother pulling in native opus binaries from Concentus package (using ExcludeAssets).

* Fix opus MS channel count. Explicitly disable native lib probe in OpusCodecFactory.

* Bump to package 2.2.0 which has split out the native libs, as suggested.

---------

Co-authored-by: Logan Stromberg <lostromb@microsoft.com>
2024-05-20 18:38:38 -03:00
6 changed files with 159 additions and 103 deletions

View File

@ -11,7 +11,7 @@
<PackageVersion Include="Avalonia.Svg" Version="11.0.0.18" />
<PackageVersion Include="Avalonia.Svg.Skia" Version="11.0.0.18" />
<PackageVersion Include="CommandLineParser" Version="2.9.1" />
<PackageVersion Include="Concentus" Version="1.1.7" />
<PackageVersion Include="Concentus" Version="2.2.0" />
<PackageVersion Include="DiscordRichPresence" Version="1.2.1.24" />
<PackageVersion Include="DynamicData" Version="8.4.1" />
<PackageVersion Include="FluentAvaloniaUI" Version="2.0.5" />
@ -49,4 +49,4 @@
<PackageVersion Include="System.Management" Version="8.0.0" />
<PackageVersion Include="UnicornEngine.Unicorn" Version="2.0.2-rc1-fb78016" />
</ItemGroup>
</Project>
</Project>

View File

@ -28,42 +28,25 @@ namespace Ryujinx.HLE.HOS.Kernel.Threading
private SchedulingState _state;
private AutoResetEvent _idleInterruptEvent;
private readonly object _idleInterruptEventLock;
private KThread _previousThread;
private KThread _currentThread;
private readonly KThread _idleThread;
private int _coreIdleLock;
private bool _idleSignalled = true;
private bool _idleActive = true;
private long _idleTimeRunning;
public KThread PreviousThread => _previousThread;
public KThread CurrentThread => _currentThread;
public long LastContextSwitchTime { get; private set; }
public long TotalIdleTimeTicks => _idleThread.TotalTimeRunning;
public long TotalIdleTimeTicks => _idleTimeRunning;
public KScheduler(KernelContext context, int coreId)
{
_context = context;
_coreId = coreId;
_idleInterruptEvent = new AutoResetEvent(false);
_idleInterruptEventLock = new object();
KThread idleThread = CreateIdleThread(context, coreId);
_currentThread = idleThread;
_idleThread = idleThread;
idleThread.StartHostThread();
idleThread.SchedulerWaitEvent.Set();
}
private KThread CreateIdleThread(KernelContext context, int cpuCore)
{
KThread idleThread = new(context);
idleThread.Initialize(0UL, 0UL, 0UL, PrioritiesCount, cpuCore, null, ThreadType.Dummy, IdleThreadLoop);
return idleThread;
_currentThread = null;
}
public static ulong SelectThreads(KernelContext context)
@ -237,39 +220,64 @@ namespace Ryujinx.HLE.HOS.Kernel.Threading
KThread threadToSignal = context.Schedulers[coreToSignal]._currentThread;
// Request the thread running on that core to stop and reschedule, if we have one.
if (threadToSignal != context.Schedulers[coreToSignal]._idleThread)
{
threadToSignal.Context.RequestInterrupt();
}
threadToSignal?.Context.RequestInterrupt();
// If the core is idle, ensure that the idle thread is awaken.
context.Schedulers[coreToSignal]._idleInterruptEvent.Set();
context.Schedulers[coreToSignal].NotifyIdleThread();
scheduledCoresMask &= ~(1UL << coreToSignal);
}
}
private void IdleThreadLoop()
private void ActivateIdleThread()
{
while (_context.Running)
while (Interlocked.CompareExchange(ref _coreIdleLock, 1, 0) != 0)
{
Thread.SpinWait(1);
}
Thread.MemoryBarrier();
// Signals that idle thread is now active on this core.
_idleActive = true;
TryLeaveIdle();
Interlocked.Exchange(ref _coreIdleLock, 0);
}
private void NotifyIdleThread()
{
while (Interlocked.CompareExchange(ref _coreIdleLock, 1, 0) != 0)
{
Thread.SpinWait(1);
}
Thread.MemoryBarrier();
// Signals that the idle core may be able to exit idle.
_idleSignalled = true;
TryLeaveIdle();
Interlocked.Exchange(ref _coreIdleLock, 0);
}
public void TryLeaveIdle()
{
if (_idleSignalled && _idleActive)
{
_state.NeedsScheduling = false;
Thread.MemoryBarrier();
KThread nextThread = PickNextThread(_state.SelectedThread);
KThread nextThread = PickNextThread(null, _state.SelectedThread);
if (_idleThread != nextThread)
if (nextThread != null)
{
_idleThread.SchedulerWaitEvent.Reset();
WaitHandle.SignalAndWait(nextThread.SchedulerWaitEvent, _idleThread.SchedulerWaitEvent);
_idleActive = false;
nextThread.SchedulerWaitEvent.Set();
}
_idleInterruptEvent.WaitOne();
}
lock (_idleInterruptEventLock)
{
_idleInterruptEvent.Dispose();
_idleInterruptEvent = null;
_idleSignalled = false;
}
}
@ -292,20 +300,37 @@ namespace Ryujinx.HLE.HOS.Kernel.Threading
// Wake all the threads that might be waiting until this thread context is unlocked.
for (int core = 0; core < CpuCoresCount; core++)
{
_context.Schedulers[core]._idleInterruptEvent.Set();
_context.Schedulers[core].NotifyIdleThread();
}
KThread nextThread = PickNextThread(selectedThread);
KThread nextThread = PickNextThread(KernelStatic.GetCurrentThread(), selectedThread);
if (currentThread.Context.Running)
{
// Wait until this thread is scheduled again, and allow the next thread to run.
WaitHandle.SignalAndWait(nextThread.SchedulerWaitEvent, currentThread.SchedulerWaitEvent);
if (nextThread == null)
{
ActivateIdleThread();
currentThread.SchedulerWaitEvent.WaitOne();
}
else
{
WaitHandle.SignalAndWait(nextThread.SchedulerWaitEvent, currentThread.SchedulerWaitEvent);
}
}
else
{
// Allow the next thread to run.
nextThread.SchedulerWaitEvent.Set();
if (nextThread == null)
{
ActivateIdleThread();
}
else
{
nextThread.SchedulerWaitEvent.Set();
}
// We don't need to wait since the thread is exiting, however we need to
// make sure this thread will never call the scheduler again, since it is
@ -319,7 +344,7 @@ namespace Ryujinx.HLE.HOS.Kernel.Threading
}
}
private KThread PickNextThread(KThread selectedThread)
private KThread PickNextThread(KThread currentThread, KThread selectedThread)
{
while (true)
{
@ -335,7 +360,7 @@ namespace Ryujinx.HLE.HOS.Kernel.Threading
// on the core, as the scheduled thread will handle the next switch.
if (selectedThread.ThreadContext.Lock())
{
SwitchTo(selectedThread);
SwitchTo(currentThread, selectedThread);
if (!_state.NeedsScheduling)
{
@ -346,15 +371,15 @@ namespace Ryujinx.HLE.HOS.Kernel.Threading
}
else
{
return _idleThread;
return null;
}
}
else
{
// The core is idle now, make sure that the idle thread can run
// and switch the core when a thread is available.
SwitchTo(null);
return _idleThread;
SwitchTo(currentThread, null);
return null;
}
_state.NeedsScheduling = false;
@ -363,12 +388,9 @@ namespace Ryujinx.HLE.HOS.Kernel.Threading
}
}
private void SwitchTo(KThread nextThread)
private void SwitchTo(KThread currentThread, KThread nextThread)
{
KProcess currentProcess = KernelStatic.GetCurrentProcess();
KThread currentThread = KernelStatic.GetCurrentThread();
nextThread ??= _idleThread;
KProcess currentProcess = currentThread?.Owner;
if (currentThread != nextThread)
{
@ -376,7 +398,14 @@ namespace Ryujinx.HLE.HOS.Kernel.Threading
long currentTicks = PerformanceCounter.ElapsedTicks;
long ticksDelta = currentTicks - previousTicks;
currentThread.AddCpuTime(ticksDelta);
if (currentThread == null)
{
Interlocked.Add(ref _idleTimeRunning, ticksDelta);
}
else
{
currentThread.AddCpuTime(ticksDelta);
}
currentProcess?.AddCpuTime(ticksDelta);
@ -386,13 +415,13 @@ namespace Ryujinx.HLE.HOS.Kernel.Threading
{
_previousThread = !currentThread.TerminationRequested && currentThread.ActiveCore == _coreId ? currentThread : null;
}
else if (currentThread == _idleThread)
else if (currentThread == null)
{
_previousThread = null;
}
}
if (nextThread.CurrentCore != _coreId)
if (nextThread != null && nextThread.CurrentCore != _coreId)
{
nextThread.CurrentCore = _coreId;
}
@ -645,11 +674,7 @@ namespace Ryujinx.HLE.HOS.Kernel.Threading
public void Dispose()
{
// Ensure that the idle thread is not blocked and can exit.
lock (_idleInterruptEventLock)
{
_idleInterruptEvent?.Set();
}
// No resources to dispose for now.
}
}
}

View File

@ -143,9 +143,7 @@ namespace Ryujinx.HLE.HOS.Kernel.Threading
PreferredCore = cpuCore;
AffinityMask |= 1UL << cpuCore;
SchedFlags = type == ThreadType.Dummy
? ThreadSchedState.Running
: ThreadSchedState.None;
SchedFlags = ThreadSchedState.None;
ActiveCore = cpuCore;
ObjSyncResult = KernelResult.ThreadNotStarted;
@ -1055,6 +1053,7 @@ namespace Ryujinx.HLE.HOS.Kernel.Threading
// If the thread is not schedulable, we want to just run or pause
// it directly as we don't care about priority or the core it is
// running on in this case.
if (SchedFlags == ThreadSchedState.Running)
{
_schedulerWaitEvent.Set();

View File

@ -2,7 +2,6 @@ namespace Ryujinx.HLE.HOS.Kernel.Threading
{
enum ThreadType
{
Dummy,
Kernel,
Kernel2,
User,

View File

@ -1,4 +1,4 @@
<Project Sdk="Microsoft.NET.Sdk">
<Project Sdk="Microsoft.NET.Sdk">
<PropertyGroup>
<TargetFramework>net8.0</TargetFramework>
@ -16,10 +16,4 @@
<PackageReference Include="Concentus" />
<PackageReference Include="LibHac" />
</ItemGroup>
<!-- Due to Concentus. -->
<PropertyGroup>
<NoWarn>NU1605</NoWarn>
</PropertyGroup>
</Project>

View File

@ -14,6 +14,11 @@ namespace Ryujinx.Horizon.Sdk.Codec.Detail
{
partial class HardwareOpusDecoder : IHardwareOpusDecoder, IDisposable
{
static HardwareOpusDecoder()
{
OpusCodecFactory.AttemptToUseNativeLibrary = false;
}
[StructLayout(LayoutKind.Sequential)]
private struct OpusPacketHeader
{
@ -30,60 +35,87 @@ namespace Ryujinx.Horizon.Sdk.Codec.Detail
}
}
private interface IDecoder
private interface IDecoder : IDisposable
{
int SampleRate { get; }
int ChannelsCount { get; }
int Decode(byte[] inData, int inDataOffset, int len, short[] outPcm, int outPcmOffset, int frameSize);
int Decode(ReadOnlySpan<byte> inData, Span<short> outPcm, int frameSize);
void ResetState();
}
private class Decoder : IDecoder
{
private readonly OpusDecoder _decoder;
private readonly IOpusDecoder _decoder;
public int SampleRate => _decoder.SampleRate;
public int ChannelsCount => _decoder.NumChannels;
public Decoder(int sampleRate, int channelsCount)
{
_decoder = new OpusDecoder(sampleRate, channelsCount);
_decoder = OpusCodecFactory.CreateDecoder(sampleRate, channelsCount);
}
public int Decode(byte[] inData, int inDataOffset, int len, short[] outPcm, int outPcmOffset, int frameSize)
public int Decode(ReadOnlySpan<byte> inData, Span<short> outPcm, int frameSize)
{
return _decoder.Decode(inData, inDataOffset, len, outPcm, outPcmOffset, frameSize);
return _decoder.Decode(inData, outPcm, frameSize);
}
public void ResetState()
{
_decoder.ResetState();
}
public void Dispose()
{
Dispose(disposing: true);
GC.SuppressFinalize(this);
}
protected virtual void Dispose(bool disposing)
{
if (disposing)
{
_decoder?.Dispose();
}
}
}
private class MultiSampleDecoder : IDecoder
{
private readonly OpusMSDecoder _decoder;
private readonly IOpusMultiStreamDecoder _decoder;
public int SampleRate => _decoder.SampleRate;
public int ChannelsCount { get; }
public int ChannelsCount => _decoder.NumChannels;
public MultiSampleDecoder(int sampleRate, int channelsCount, int streams, int coupledStreams, byte[] mapping)
{
ChannelsCount = channelsCount;
_decoder = new OpusMSDecoder(sampleRate, channelsCount, streams, coupledStreams, mapping);
_decoder = OpusCodecFactory.CreateMultiStreamDecoder(sampleRate, channelsCount, streams, coupledStreams, mapping);
}
public int Decode(byte[] inData, int inDataOffset, int len, short[] outPcm, int outPcmOffset, int frameSize)
public int Decode(ReadOnlySpan<byte> inData, Span<short> outPcm, int frameSize)
{
return _decoder.DecodeMultistream(inData, inDataOffset, len, outPcm, outPcmOffset, frameSize, 0);
return _decoder.DecodeMultistream(inData, outPcm, frameSize, false);
}
public void ResetState()
{
_decoder.ResetState();
}
public void Dispose()
{
Dispose(disposing: true);
GC.SuppressFinalize(this);
}
protected virtual void Dispose(bool disposing)
{
if (disposing)
{
_decoder?.Dispose();
}
}
}
private readonly IDecoder _decoder;
@ -221,7 +253,8 @@ namespace Ryujinx.Horizon.Sdk.Codec.Detail
{
timeTaken = 0;
Result result = DecodeInterleaved(_decoder, reset, input, out short[] outPcmData, output.Length, out outConsumed, out outSamples);
Span<short> outPcmSpace = MemoryMarshal.Cast<byte, short>(output);
Result result = DecodeInterleaved(_decoder, reset, input, outPcmSpace, output.Length, out outConsumed, out outSamples);
if (withPerf)
{
@ -229,14 +262,12 @@ namespace Ryujinx.Horizon.Sdk.Codec.Detail
timeTaken = 0;
}
MemoryMarshal.Cast<short, byte>(outPcmData).CopyTo(output[..(outPcmData.Length * sizeof(short))]);
return result;
}
private static Result GetPacketNumSamples(IDecoder decoder, out int numSamples, byte[] packet)
private static Result GetPacketNumSamples(IDecoder decoder, out int numSamples, ReadOnlySpan<byte> packet)
{
int result = OpusPacketInfo.GetNumSamples(packet, 0, packet.Length, decoder.SampleRate);
int result = OpusPacketInfo.GetNumSamples(packet, decoder.SampleRate);
numSamples = result;
@ -256,12 +287,11 @@ namespace Ryujinx.Horizon.Sdk.Codec.Detail
IDecoder decoder,
bool reset,
ReadOnlySpan<byte> input,
out short[] outPcmData,
Span<short> outPcmData,
int outputSize,
out int outConsumed,
out int outSamples)
{
outPcmData = null;
outConsumed = 0;
outSamples = 0;
@ -281,7 +311,7 @@ namespace Ryujinx.Horizon.Sdk.Codec.Detail
return CodecResult.InvalidLength;
}
byte[] opusData = input.Slice(headerSize, (int)header.Length).ToArray();
ReadOnlySpan<byte> opusData = input.Slice(headerSize, (int)header.Length);
Result result = GetPacketNumSamples(decoder, out int numSamples, opusData);
@ -292,8 +322,6 @@ namespace Ryujinx.Horizon.Sdk.Codec.Detail
return CodecResult.InvalidLength;
}
outPcmData = new short[numSamples * decoder.ChannelsCount];
if (reset)
{
decoder.ResetState();
@ -301,13 +329,22 @@ namespace Ryujinx.Horizon.Sdk.Codec.Detail
try
{
outSamples = decoder.Decode(opusData, 0, opusData.Length, outPcmData, 0, outPcmData.Length / decoder.ChannelsCount);
outSamples = decoder.Decode(opusData, outPcmData, numSamples);
outConsumed = (int)totalSize;
}
catch (OpusException)
catch (OpusException e)
{
// TODO: As OpusException doesn't return the exact error code, this is inaccurate in some cases...
return CodecResult.InvalidLength;
switch (e.OpusErrorCode)
{
case OpusError.OPUS_BUFFER_TOO_SMALL:
return CodecResult.InvalidLength;
case OpusError.OPUS_BAD_ARG:
return CodecResult.OpusBadArg;
case OpusError.OPUS_INVALID_PACKET:
return CodecResult.OpusInvalidPacket;
default:
return CodecResult.InvalidLength;
}
}
}
@ -324,6 +361,8 @@ namespace Ryujinx.Horizon.Sdk.Codec.Detail
_workBufferHandle = 0;
}
_decoder?.Dispose();
}
}