Compare commits

..

8 Commits

Author SHA1 Message Date
c52158b733 Add timestamp to 16-byte/4-word semaphore releases. (#3049)
* Add timestamp to 16-byte semaphore releases.

BOTW was reading a ulong 8 bytes after a semaphore return. Turns out this is the timestamp it was trying to do performance calculation with, so I've made it write when necessary.

This mode was also added to the DMA semaphore I added recently, as it is required by a few games. (i think quake?)

The timestamp code has been moved to GPU context. Check other games with an unusually low framerate cap or dynamic resolution to see if they have improved.

* Cast dma semaphore payload to ulong to fill the space

* Write timestamp first

Might be just worrying too much, but we don't want the application reading the timestamp if it sees the payload before the timestamp is written.
2022-01-27 22:50:32 +01:00
fd6d3ec88f Fix res scale parameters not being updated in vertex shader (#3046)
This fixes an issue where the render scale array would not be updated when technically the scales on the flat array were the same, but the start index for the vertex scales was different.
2022-01-27 14:17:13 -03:00
0a0a95fd81 Convert Octal-Mode to Decimal (#3041)
Apparently C# doesn't use 0 as a prefix like C does.
2022-01-25 23:31:04 +01:00
26019c7d06 Fix regression on PR builds version number since new release system 2022-01-24 18:49:14 +01:00
f3bfd799e1 Fix calls passing V128 values on Linux (#3034)
* Fix calls passing V128 values on Linux

* PPTC version bump
2022-01-24 11:23:24 +01:00
b2ebbe8b22 amadeus: Fix possible device sink input out of bound (#3032)
This fixes an out-of-bounds access when indexing inputs for games that use
unsupported values (8 here)

Close #2724.
2022-01-23 23:36:31 +01:00
4910b214f5 Set _vibrationPermitted to True by default (#2985)
Co-authored-by: SpookyBee123 <82302189+SpookyBee123@users.noreply.github.com>
2022-01-23 12:24:55 +01:00
42c75dbb8f Add support for BC1/2/3 decompression (for 3D textures) (#2987)
* Add support for BC1/2/3 decompression (for 3D textures)

* Optimize and clean up

* Unsafe not needed here

* Fix alpha value interpolation when a0 <= a1
2022-01-22 19:23:00 +01:00
18 changed files with 804 additions and 203 deletions

View File

@ -796,6 +796,8 @@ namespace ARMeilleure.CodeGen.X86
}
}
node.SetSources(sources.ToArray());
if (dest != default)
{
if (dest.Type == OperandType.V128)
@ -823,8 +825,6 @@ namespace ARMeilleure.CodeGen.X86
node.Destination = retReg;
}
}
node.SetSources(sources.ToArray());
}
private static void HandleTailcallSystemVAbi(IntrusiveList<Operation> nodes, StackAllocator stackAlloc, Operation node)

View File

@ -27,7 +27,7 @@ namespace ARMeilleure.Translation.PTC
private const string OuterHeaderMagicString = "PTCohd\0\0";
private const string InnerHeaderMagicString = "PTCihd\0\0";
private const uint InternalVersion = 3015; //! To be incremented manually for each change to the ARMeilleure project.
private const uint InternalVersion = 3034; //! To be incremented manually for each change to the ARMeilleure project.
private const string ActualDir = "0";
private const string BackupDir = "1";

View File

@ -52,7 +52,7 @@ namespace Ryujinx.Audio.Renderer.Dsp.Command
InputCount = sink.Parameter.InputCount;
InputBufferIndices = new ushort[InputCount];
for (int i = 0; i < InputCount; i++)
for (int i = 0; i < Math.Min(InputCount, Constants.ChannelCountMax); i++)
{
InputBufferIndices[i] = (ushort)(bufferOffset + sink.Parameter.Input[i]);
}

View File

@ -1,4 +1,6 @@
namespace Ryujinx.Common
using System.Reflection;
namespace Ryujinx.Common
{
// DO NOT EDIT, filled by CI
public static class ReleaseInformations
@ -25,7 +27,7 @@
}
else
{
return "1.0.0-dirty";
return Assembly.GetEntryAssembly().GetCustomAttribute<AssemblyInformationalVersionAttribute>().InformationalVersion;
}
}
}

View File

@ -2,30 +2,32 @@ namespace Ryujinx.Graphics.GAL
{
public struct Capabilities
{
public bool HasFrontFacingBug { get; }
public bool HasVectorIndexingBug { get; }
public readonly bool HasFrontFacingBug;
public readonly bool HasVectorIndexingBug;
public bool SupportsAstcCompression { get; }
public bool SupportsBgraFormat { get; }
public bool SupportsR4G4Format { get; }
public bool SupportsFragmentShaderInterlock { get; }
public bool SupportsFragmentShaderOrderingIntel { get; }
public bool SupportsImageLoadFormatted { get; }
public bool SupportsMismatchingViewFormat { get; }
public bool SupportsNonConstantTextureOffset { get; }
public bool SupportsShaderBallot { get; }
public bool SupportsTextureShadowLod { get; }
public bool SupportsViewportSwizzle { get; }
public bool SupportsIndirectParameters { get; }
public readonly bool SupportsAstcCompression;
public readonly bool Supports3DTextureCompression;
public readonly bool SupportsBgraFormat;
public readonly bool SupportsR4G4Format;
public readonly bool SupportsFragmentShaderInterlock;
public readonly bool SupportsFragmentShaderOrderingIntel;
public readonly bool SupportsImageLoadFormatted;
public readonly bool SupportsMismatchingViewFormat;
public readonly bool SupportsNonConstantTextureOffset;
public readonly bool SupportsShaderBallot;
public readonly bool SupportsTextureShadowLod;
public readonly bool SupportsViewportSwizzle;
public readonly bool SupportsIndirectParameters;
public int MaximumComputeSharedMemorySize { get; }
public float MaximumSupportedAnisotropy { get; }
public int StorageBufferOffsetAlignment { get; }
public readonly int MaximumComputeSharedMemorySize;
public readonly float MaximumSupportedAnisotropy;
public readonly int StorageBufferOffsetAlignment;
public Capabilities(
bool hasFrontFacingBug,
bool hasVectorIndexingBug,
bool supportsAstcCompression,
bool supports3DTextureCompression,
bool supportsBgraFormat,
bool supportsR4G4Format,
bool supportsFragmentShaderInterlock,
@ -44,6 +46,7 @@ namespace Ryujinx.Graphics.GAL
HasFrontFacingBug = hasFrontFacingBug;
HasVectorIndexingBug = hasVectorIndexingBug;
SupportsAstcCompression = supportsAstcCompression;
Supports3DTextureCompression = supports3DTextureCompression;
SupportsBgraFormat = supportsBgraFormat;
SupportsR4G4Format = supportsR4G4Format;
SupportsFragmentShaderInterlock = supportsFragmentShaderInterlock;

View File

@ -67,11 +67,9 @@ namespace Ryujinx.Graphics.GAL
R10G10B10A2Uint,
R11G11B10Float,
R9G9B9E5Float,
Bc1RgbUnorm,
Bc1RgbaUnorm,
Bc2Unorm,
Bc3Unorm,
Bc1RgbSrgb,
Bc1RgbaSrgb,
Bc2Srgb,
Bc3Srgb,
@ -349,25 +347,5 @@ namespace Ryujinx.Graphics.GAL
{
return format.IsUint() || format.IsSint();
}
/// <summary>
/// Checks if the texture format is a BC4 compressed format.
/// </summary>
/// <param name="format">Texture format</param>
/// <returns>True if the texture format is a BC4 compressed format, false otherwise</returns>
public static bool IsBc4(this Format format)
{
return format == Format.Bc4Unorm || format == Format.Bc4Snorm;
}
/// <summary>
/// Checks if the texture format is a BC5 compressed format.
/// </summary>
/// <param name="format">Texture format</param>
/// <returns>True if the texture format is a BC5 compressed format, false otherwise</returns>
public static bool IsBc5(this Format format)
{
return format == Format.Bc5Unorm || format == Format.Bc5Snorm;
}
}
}

View File

@ -115,7 +115,8 @@ namespace Ryujinx.Graphics.Gpu.Engine.Dma
}
else /* if (type == LaunchDmaSemaphoreType.ReleaseFourWordSemaphore) */
{
Logger.Warning?.Print(LogClass.Gpu, "DMA semaphore type ReleaseFourWordSemaphore was used, but is not currently implemented.");
_channel.MemoryManager.Write(address + 8, _context.GetTimestamp());
_channel.MemoryManager.Write(address, (ulong)_state.State.SetSemaphorePayload);
}
}
}

View File

@ -75,6 +75,12 @@ namespace Ryujinx.Graphics.Gpu.Engine.GPFifo
SemaphoredOperation operation = _state.State.SemaphoredOperation;
if (_state.State.SemaphoredReleaseSize == SemaphoredReleaseSize.SixteenBytes)
{
_parent.MemoryManager.Write(address + 4, 0);
_parent.MemoryManager.Write(address + 8, _context.GetTimestamp());
}
// TODO: Acquire operations (Wait), interrupts for invalid combinations.
if (operation == SemaphoredOperation.Release)
{

View File

@ -1,6 +1,4 @@
using Ryujinx.Common;
using Ryujinx.Graphics.GAL;
using System.Runtime.InteropServices;
using Ryujinx.Graphics.GAL;
namespace Ryujinx.Graphics.Gpu.Engine.Threed
{
@ -9,9 +7,6 @@ namespace Ryujinx.Graphics.Gpu.Engine.Threed
/// </summary>
class SemaphoreUpdater
{
private const int NsToTicksFractionNumerator = 384;
private const int NsToTicksFractionDenominator = 625;
/// <summary>
/// GPU semaphore operation.
/// </summary>
@ -154,14 +149,7 @@ namespace Ryujinx.Graphics.Gpu.Engine.Threed
{
ulong gpuVa = _state.State.SemaphoreAddress.Pack();
ulong ticks = ConvertNanosecondsToTicks((ulong)PerformanceCounter.ElapsedNanoseconds);
if (GraphicsConfig.FastGpuTime)
{
// Divide by some amount to report time as if operations were performed faster than they really are.
// This can prevent some games from switching to a lower resolution because rendering is too slow.
ticks /= 256;
}
ulong ticks = _context.GetTimestamp();
ICounterEvent counter = null;
@ -197,27 +185,5 @@ namespace Ryujinx.Graphics.Gpu.Engine.Threed
_channel.MemoryManager.CounterCache.AddOrUpdate(gpuVa, counter);
}
/// <summary>
/// Converts a nanoseconds timestamp value to Maxwell time ticks.
/// </summary>
/// <remarks>
/// The frequency is 614400000 Hz.
/// </remarks>
/// <param name="nanoseconds">Timestamp in nanoseconds</param>
/// <returns>Maxwell ticks</returns>
private static ulong ConvertNanosecondsToTicks(ulong nanoseconds)
{
// We need to divide first to avoid overflows.
// We fix up the result later by calculating the difference and adding
// that to the result.
ulong divided = nanoseconds / NsToTicksFractionDenominator;
ulong rounded = divided * NsToTicksFractionDenominator;
ulong errorBias = (nanoseconds - rounded) * NsToTicksFractionNumerator / NsToTicksFractionDenominator;
return divided * NsToTicksFractionNumerator + errorBias;
}
}
}

View File

@ -1,3 +1,4 @@
using Ryujinx.Common;
using Ryujinx.Graphics.GAL;
using Ryujinx.Graphics.Gpu.Engine.GPFifo;
using Ryujinx.Graphics.Gpu.Memory;
@ -15,6 +16,9 @@ namespace Ryujinx.Graphics.Gpu
/// </summary>
public sealed class GpuContext : IDisposable
{
private const int NsToTicksFractionNumerator = 384;
private const int NsToTicksFractionDenominator = 625;
/// <summary>
/// Event signaled when the host emulation context is ready to be used by the gpu context.
/// </summary>
@ -78,14 +82,27 @@ namespace Ryujinx.Graphics.Gpu
/// <summary>
/// Host hardware capabilities.
/// </summary>
internal Capabilities Capabilities => _caps.Value;
internal ref Capabilities Capabilities
{
    get
    {
        // Already queried once: hand back a reference to the cached copy.
        if (_capsLoaded)
        {
            return ref _caps;
        }

        // First access: ask the renderer for its capabilities and remember them.
        // NOTE(review): no synchronization is visible around this lazy load - confirm
        // that concurrent first accesses are either impossible or harmless.
        _caps = Renderer.GetCapabilities();
        _capsLoaded = true;

        return ref _caps;
    }
}
/// <summary>
/// Event for signalling shader cache loading progress.
/// </summary>
public event Action<ShaderCacheState, int, int> ShaderCacheStateChanged;
private readonly Lazy<Capabilities> _caps;
private bool _capsLoaded;
private Capabilities _caps;
private Thread _gpuThread;
/// <summary>
@ -110,8 +127,6 @@ namespace Ryujinx.Graphics.Gpu
DeferredActions = new Queue<Action>();
PhysicalMemoryRegistry = new ConcurrentDictionary<long, PhysicalMemory>();
_caps = new Lazy<Capabilities>(Renderer.GetCapabilities);
}
/// <summary>
@ -169,6 +184,46 @@ namespace Ryujinx.Graphics.Gpu
}
}
/// <summary>
/// Converts a nanoseconds timestamp value to Maxwell time ticks.
/// </summary>
/// <remarks>
/// The frequency is 614400000 Hz.
/// </remarks>
/// <param name="nanoseconds">Timestamp in nanoseconds</param>
/// <returns>Maxwell ticks</returns>
private static ulong ConvertNanosecondsToTicks(ulong nanoseconds)
{
    // Multiplying the full value by the numerator first could overflow, so the
    // input is split into whole multiples of the denominator plus a remainder,
    // and each part is scaled separately.
    ulong wholeUnits = nanoseconds / NsToTicksFractionDenominator;
    ulong remainder = nanoseconds % NsToTicksFractionDenominator;

    // The remainder is smaller than the denominator, so scaling it cannot overflow.
    ulong remainderTicks = remainder * NsToTicksFractionNumerator / NsToTicksFractionDenominator;

    return wholeUnits * NsToTicksFractionNumerator + remainderTicks;
}
/// <summary>
/// Gets the value of the GPU timer.
/// </summary>
/// <returns>The current GPU timestamp</returns>
public ulong GetTimestamp()
{
    ulong timestamp = ConvertNanosecondsToTicks((ulong)PerformanceCounter.ElapsedNanoseconds);

    // With fast GPU time enabled, the value is divided by some amount to report time
    // as if operations were performed faster than they really are. This can prevent
    // some games from switching to a lower resolution because rendering is too slow.
    return GraphicsConfig.FastGpuTime ? timestamp / 256 : timestamp;
}
/// <summary>
/// Shader cache state update handler.
/// </summary>

View File

@ -834,13 +834,31 @@ namespace Ryujinx.Graphics.Gpu.Image
{
data = PixelConverter.ConvertR4G4ToR4G4B4A4(data);
}
else if (Target == Target.Texture3D && Format.IsBc4())
else if (!_context.Capabilities.Supports3DTextureCompression && Target == Target.Texture3D)
{
data = BCnDecoder.DecodeBC4(data, width, height, depth, levels, layers, Info.FormatInfo.Format == Format.Bc4Snorm);
}
else if (Target == Target.Texture3D && Format.IsBc5())
{
data = BCnDecoder.DecodeBC5(data, width, height, depth, levels, layers, Info.FormatInfo.Format == Format.Bc5Snorm);
switch (Format)
{
case Format.Bc1RgbaSrgb:
case Format.Bc1RgbaUnorm:
data = BCnDecoder.DecodeBC1(data, width, height, depth, levels, layers);
break;
case Format.Bc2Srgb:
case Format.Bc2Unorm:
data = BCnDecoder.DecodeBC2(data, width, height, depth, levels, layers);
break;
case Format.Bc3Srgb:
case Format.Bc3Unorm:
data = BCnDecoder.DecodeBC3(data, width, height, depth, levels, layers);
break;
case Format.Bc4Snorm:
case Format.Bc4Unorm:
data = BCnDecoder.DecodeBC4(data, width, height, depth, levels, layers, Format == Format.Bc4Snorm);
break;
case Format.Bc5Snorm:
case Format.Bc5Unorm:
data = BCnDecoder.DecodeBC5(data, width, height, depth, levels, layers, Format == Format.Bc5Snorm);
break;
}
}
return data;

View File

@ -49,6 +49,7 @@ namespace Ryujinx.Graphics.Gpu.Image
private readonly float[] _scales;
private bool _scaleChanged;
private int _lastFragmentTotal;
/// <summary>
/// Constructs a new instance of the texture bindings manager.
@ -288,26 +289,30 @@ namespace Ryujinx.Graphics.Gpu.Image
/// </summary>
private void CommitRenderScale()
{
// Stage 0 total: Compute or Vertex.
int total = _textureBindingsCount[0] + _imageBindingsCount[0];
int fragmentIndex = (int)ShaderStage.Fragment - 1;
int fragmentTotal = _isCompute ? 0 : (_textureBindingsCount[fragmentIndex] + _imageBindingsCount[fragmentIndex]);
if (total != 0 && fragmentTotal != _lastFragmentTotal)
{
// Must update scales in the support buffer if:
// - Vertex stage has bindings.
// - Fragment stage binding count has been updated since last render scale update.
_scaleChanged = true;
}
if (_scaleChanged)
{
int fragmentTotal = 0;
int total;
if (!_isCompute)
{
int fragmentIndex = (int)ShaderStage.Fragment - 1;
fragmentTotal = _textureBindingsCount[fragmentIndex] + _imageBindingsCount[fragmentIndex];
int vertexIndex = (int)ShaderStage.Vertex - 1;
int vertexTotal = _textureBindingsCount[vertexIndex] + _imageBindingsCount[vertexIndex];
total = fragmentTotal + vertexTotal;
}
else
{
total = _textureBindingsCount[0] + _imageBindingsCount[0];
total += fragmentTotal; // Add the fragment bindings to the total.
}
_lastFragmentTotal = fragmentTotal;
_context.Renderer.Pipeline.UpdateRenderScale(_scales, total, fragmentTotal);
_scaleChanged = false;

View File

@ -14,9 +14,6 @@ namespace Ryujinx.Graphics.Gpu.Image
private enum FormatClass
{
Unclassified,
BCn64,
BCn128,
Bc1Rgb,
Bc1Rgba,
Bc2,
Bc3,
@ -88,13 +85,21 @@ namespace Ryujinx.Graphics.Gpu.Image
return new FormatInfo(Format.R4G4B4A4Unorm, 1, 1, 2, 4);
}
if (info.Target == Target.Texture3D)
if (!caps.Supports3DTextureCompression && info.Target == Target.Texture3D)
{
// The host API does not support 3D BC4/BC5 compressed formats.
// The host API does not support 3D compressed formats.
// We assume software decompression will be done for those textures,
// and so we adjust the format here to match the decompressor output.
switch (info.FormatInfo.Format)
{
case Format.Bc1RgbaSrgb:
case Format.Bc2Srgb:
case Format.Bc3Srgb:
return new FormatInfo(Format.R8G8B8A8Srgb, 1, 1, 4, 4);
case Format.Bc1RgbaUnorm:
case Format.Bc2Unorm:
case Format.Bc3Unorm:
return new FormatInfo(Format.R8G8B8A8Unorm, 1, 1, 4, 4);
case Format.Bc4Unorm:
return new FormatInfo(Format.R8Unorm, 1, 1, 1, 1);
case Format.Bc4Snorm:
@ -749,9 +754,6 @@ namespace Ryujinx.Graphics.Gpu.Image
{
switch (format)
{
case Format.Bc1RgbSrgb:
case Format.Bc1RgbUnorm:
return FormatClass.Bc1Rgb;
case Format.Bc1RgbaSrgb:
case Format.Bc1RgbaUnorm:
return FormatClass.Bc1Rgba;

View File

@ -80,11 +80,9 @@ namespace Ryujinx.Graphics.OpenGL
Add(Format.R10G10B10A2Uint, new FormatInfo(4, false, false, All.Rgb10A2ui, PixelFormat.RgbaInteger, PixelType.UnsignedInt2101010Reversed));
Add(Format.R11G11B10Float, new FormatInfo(3, false, false, All.R11fG11fB10f, PixelFormat.Rgb, PixelType.UnsignedInt10F11F11FRev));
Add(Format.R9G9B9E5Float, new FormatInfo(3, false, false, All.Rgb9E5, PixelFormat.Rgb, PixelType.UnsignedInt5999Rev));
Add(Format.Bc1RgbUnorm, new FormatInfo(3, true, false, All.CompressedRgbS3tcDxt1Ext));
Add(Format.Bc1RgbaUnorm, new FormatInfo(4, true, false, All.CompressedRgbaS3tcDxt1Ext));
Add(Format.Bc2Unorm, new FormatInfo(4, true, false, All.CompressedRgbaS3tcDxt3Ext));
Add(Format.Bc3Unorm, new FormatInfo(4, true, false, All.CompressedRgbaS3tcDxt5Ext));
Add(Format.Bc1RgbSrgb, new FormatInfo(3, false, false, All.CompressedSrgbS3tcDxt1Ext));
Add(Format.Bc1RgbaSrgb, new FormatInfo(4, true, false, All.CompressedSrgbAlphaS3tcDxt1Ext));
Add(Format.Bc2Srgb, new FormatInfo(4, false, false, All.CompressedSrgbAlphaS3tcDxt3Ext));
Add(Format.Bc3Srgb, new FormatInfo(4, false, false, All.CompressedSrgbAlphaS3tcDxt5Ext));

View File

@ -104,6 +104,7 @@ namespace Ryujinx.Graphics.OpenGL
hasFrontFacingBug: HwCapabilities.Vendor == HwCapabilities.GpuVendor.IntelWindows,
hasVectorIndexingBug: HwCapabilities.Vendor == HwCapabilities.GpuVendor.AmdWindows,
supportsAstcCompression: HwCapabilities.SupportsAstcCompression,
supports3DTextureCompression: false,
supportsBgraFormat: false,
supportsR4G4Format: false,
supportsFragmentShaderInterlock: HwCapabilities.SupportsFragmentShaderInterlock,

View File

@ -1,7 +1,9 @@
using Ryujinx.Common;
using System;
using System.Runtime.CompilerServices;
using System.Buffers.Binary;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;
namespace Ryujinx.Graphics.Texture
{
@ -10,22 +12,30 @@ namespace Ryujinx.Graphics.Texture
private const int BlockWidth = 4;
private const int BlockHeight = 4;
public static byte[] DecodeBC4(ReadOnlySpan<byte> data, int width, int height, int depth, int levels, int layers, bool signed)
public static byte[] DecodeBC1(ReadOnlySpan<byte> data, int width, int height, int depth, int levels, int layers)
{
int size = 0;
for (int l = 0; l < levels; l++)
{
size += Math.Max(1, width >> l) * Math.Max(1, height >> l) * Math.Max(1, depth >> l) * layers;
size += Math.Max(1, width >> l) * Math.Max(1, height >> l) * Math.Max(1, depth >> l) * layers * 4;
}
byte[] output = new byte[size];
ReadOnlySpan<ulong> data64 = MemoryMarshal.Cast<byte, ulong>(data);
Span<byte> tile = stackalloc byte[BlockWidth * BlockHeight * 4];
Span<byte> rPal = stackalloc byte[8];
Span<uint> tileAsUint = MemoryMarshal.Cast<byte, uint>(tile);
Span<uint> outputAsUint = MemoryMarshal.Cast<byte, uint>(output);
int baseOOffs = 0;
Span<Vector128<byte>> tileAsVector128 = MemoryMarshal.Cast<byte, Vector128<byte>>(tile);
Span<Vector128<byte>> outputLine0 = default;
Span<Vector128<byte>> outputLine1 = default;
Span<Vector128<byte>> outputLine2 = default;
Span<Vector128<byte>> outputLine3 = default;
int imageBaseOOffs = 0;
for (int l = 0; l < levels; l++)
{
@ -39,11 +49,302 @@ namespace Ryujinx.Graphics.Texture
for (int y = 0; y < h; y++)
{
int baseY = y * BlockHeight;
int copyHeight = Math.Min(BlockHeight, height - baseY);
int lineBaseOOffs = imageBaseOOffs + baseY * width;
if (copyHeight == 4)
{
outputLine0 = MemoryMarshal.Cast<uint, Vector128<byte>>(outputAsUint.Slice(lineBaseOOffs));
outputLine1 = MemoryMarshal.Cast<uint, Vector128<byte>>(outputAsUint.Slice(lineBaseOOffs + width));
outputLine2 = MemoryMarshal.Cast<uint, Vector128<byte>>(outputAsUint.Slice(lineBaseOOffs + width * 2));
outputLine3 = MemoryMarshal.Cast<uint, Vector128<byte>>(outputAsUint.Slice(lineBaseOOffs + width * 3));
}
for (int x = 0; x < w; x++)
{
int baseX = x * BlockWidth;
int lineBaseOOffs = baseOOffs + baseX;
int copyWidth = Math.Min(BlockWidth, width - baseX);
BC1DecodeTileRgb(tile, data);
if ((copyWidth | copyHeight) == 4)
{
outputLine0[x] = tileAsVector128[0];
outputLine1[x] = tileAsVector128[1];
outputLine2[x] = tileAsVector128[2];
outputLine3[x] = tileAsVector128[3];
}
else
{
int pixelBaseOOffs = lineBaseOOffs + baseX;
for (int tY = 0; tY < copyHeight; tY++)
{
tileAsUint.Slice(tY * 4, copyWidth).CopyTo(outputAsUint.Slice(pixelBaseOOffs + width * tY, copyWidth));
}
}
data = data.Slice(8);
}
}
imageBaseOOffs += width * height;
}
}
width = Math.Max(1, width >> 1);
height = Math.Max(1, height >> 1);
depth = Math.Max(1, depth >> 1);
}
return output;
}
/// <summary>
/// Decodes BC2 block-compressed texture data into an uncompressed buffer with 4 bytes per texel.
/// </summary>
/// <remarks>
/// Each 16-byte input block covers a 4x4 texel tile: the first 8 bytes hold sixteen 4-bit alpha
/// values, the last 8 bytes are handed to BC23DecodeTileRgb (defined elsewhere in this file;
/// presumably the colour part of the block - confirm).
/// Output is written level by level, then layer by layer, then depth slice by depth slice.
/// </remarks>
/// <param name="data">Compressed input, consumed sequentially 16 bytes per block</param>
/// <param name="width">Width of the first level; halved (minimum 1) for each following level</param>
/// <param name="height">Height of the first level; halved (minimum 1) for each following level</param>
/// <param name="depth">Depth of the first level; halved (minimum 1) for each following level</param>
/// <param name="levels">Number of levels to decode</param>
/// <param name="layers">Number of layers decoded per level</param>
/// <returns>Newly allocated array with the decoded data</returns>
public static byte[] DecodeBC2(ReadOnlySpan<byte> data, int width, int height, int depth, int levels, int layers)
{
// Total output size in bytes: every texel of every level and layer takes 4 bytes.
int size = 0;
for (int l = 0; l < levels; l++)
{
size += Math.Max(1, width >> l) * Math.Max(1, height >> l) * Math.Max(1, depth >> l) * layers * 4;
}
byte[] output = new byte[size];
// Scratch buffer for one decoded 4x4 tile (16 texels, 4 bytes each).
Span<byte> tile = stackalloc byte[BlockWidth * BlockHeight * 4];
// Views over the same memory: one uint per texel, one Vector128 per 4-texel tile row.
Span<uint> tileAsUint = MemoryMarshal.Cast<byte, uint>(tile);
Span<uint> outputAsUint = MemoryMarshal.Cast<byte, uint>(output);
Span<Vector128<byte>> tileAsVector128 = MemoryMarshal.Cast<byte, Vector128<byte>>(tile);
// Views of the four output rows touched by the current row of blocks (set only for full-height rows).
Span<Vector128<byte>> outputLine0 = default;
Span<Vector128<byte>> outputLine1 = default;
Span<Vector128<byte>> outputLine2 = default;
Span<Vector128<byte>> outputLine3 = default;
// Offset (in texels) of the current 2D slice inside the output.
int imageBaseOOffs = 0;
for (int l = 0; l < levels; l++)
{
// Size of the current level in blocks, rounding partial blocks up.
int w = BitUtils.DivRoundUp(width, BlockWidth);
int h = BitUtils.DivRoundUp(height, BlockHeight);
for (int l2 = 0; l2 < layers; l2++)
{
for (int z = 0; z < depth; z++)
{
for (int y = 0; y < h; y++)
{
int baseY = y * BlockHeight;
// Number of tile rows that actually fall inside the image for this row of blocks.
int copyHeight = Math.Min(BlockHeight, height - baseY);
int lineBaseOOffs = imageBaseOOffs + baseY * width;
if (copyHeight == 4)
{
// All four rows are inside the image: prepare vector views starting at each output row.
outputLine0 = MemoryMarshal.Cast<uint, Vector128<byte>>(outputAsUint.Slice(lineBaseOOffs));
outputLine1 = MemoryMarshal.Cast<uint, Vector128<byte>>(outputAsUint.Slice(lineBaseOOffs + width));
outputLine2 = MemoryMarshal.Cast<uint, Vector128<byte>>(outputAsUint.Slice(lineBaseOOffs + width * 2));
outputLine3 = MemoryMarshal.Cast<uint, Vector128<byte>>(outputAsUint.Slice(lineBaseOOffs + width * 3));
}
for (int x = 0; x < w; x++)
{
int baseX = x * BlockWidth;
// Number of tile columns that actually fall inside the image for this block.
int copyWidth = Math.Min(BlockWidth, width - baseX);
// Second half of the block goes to the shared BC2/BC3 tile decoder.
BC23DecodeTileRgb(tile, data.Slice(8));
// First half of the block: sixteen 4-bit alpha values, lowest nibble first.
ulong block = BinaryPrimitives.ReadUInt64LittleEndian(data);
// Byte 3 of every texel receives the alpha; the 4-bit value is replicated into both nibbles.
for (int i = 3; i < BlockWidth * BlockHeight * 4; i += 4, block >>= 4)
{
tile[i] = (byte)((block & 0xf) | (block << 4));
}
// Both values are at most 4, so the OR equals 4 only when the whole tile is inside the image.
if ((copyWidth | copyHeight) == 4)
{
// Full tile: store each 4-texel row with a single 16-byte write.
outputLine0[x] = tileAsVector128[0];
outputLine1[x] = tileAsVector128[1];
outputLine2[x] = tileAsVector128[2];
outputLine3[x] = tileAsVector128[3];
}
else
{
// Partial tile at the right or bottom edge: copy only the texels inside the image.
int pixelBaseOOffs = lineBaseOOffs + baseX;
for (int tY = 0; tY < copyHeight; tY++)
{
tileAsUint.Slice(tY * 4, copyWidth).CopyTo(outputAsUint.Slice(pixelBaseOOffs + width * tY, copyWidth));
}
}
// Advance to the next 16-byte block.
data = data.Slice(16);
}
}
// Move to the next 2D slice of the output.
imageBaseOOffs += width * height;
}
}
// Dimensions of the next level.
width = Math.Max(1, width >> 1);
height = Math.Max(1, height >> 1);
depth = Math.Max(1, depth >> 1);
}
return output;
}
/// <summary>
/// Decodes BC3 block-compressed texture data into an uncompressed buffer with 4 bytes per texel.
/// </summary>
/// <remarks>
/// Each 16-byte input block covers a 4x4 texel tile: the first 8 bytes carry two alpha endpoint
/// bytes followed by 48 bits passed on as alpha selector data, the last 8 bytes are handed to
/// BC23DecodeTileRgb (defined elsewhere in this file; presumably the colour part - confirm).
/// Output is written level by level, then layer by layer, then depth slice by depth slice.
/// </remarks>
/// <param name="data">Compressed input, consumed sequentially 16 bytes per block</param>
/// <param name="width">Width of the first level; halved (minimum 1) for each following level</param>
/// <param name="height">Height of the first level; halved (minimum 1) for each following level</param>
/// <param name="depth">Depth of the first level; halved (minimum 1) for each following level</param>
/// <param name="levels">Number of levels to decode</param>
/// <param name="layers">Number of layers decoded per level</param>
/// <returns>Newly allocated array with the decoded data</returns>
public static byte[] DecodeBC3(ReadOnlySpan<byte> data, int width, int height, int depth, int levels, int layers)
{
// Total output size in bytes: every texel of every level and layer takes 4 bytes.
int size = 0;
for (int l = 0; l < levels; l++)
{
size += Math.Max(1, width >> l) * Math.Max(1, height >> l) * Math.Max(1, depth >> l) * layers * 4;
}
byte[] output = new byte[size];
// Scratch buffer for one decoded 4x4 tile (16 texels, 4 bytes each).
Span<byte> tile = stackalloc byte[BlockWidth * BlockHeight * 4];
// Alpha palette for the current block: entries 0 and 1 come from the block itself.
Span<byte> rPal = stackalloc byte[8];
// Views over the same memory: one uint per texel, one Vector128 per 4-texel tile row.
Span<uint> tileAsUint = MemoryMarshal.Cast<byte, uint>(tile);
Span<uint> outputAsUint = MemoryMarshal.Cast<byte, uint>(output);
Span<Vector128<byte>> tileAsVector128 = MemoryMarshal.Cast<byte, Vector128<byte>>(tile);
// Views of the four output rows touched by the current row of blocks (set only for full-height rows).
Span<Vector128<byte>> outputLine0 = default;
Span<Vector128<byte>> outputLine1 = default;
Span<Vector128<byte>> outputLine2 = default;
Span<Vector128<byte>> outputLine3 = default;
// Offset (in texels) of the current 2D slice inside the output.
int imageBaseOOffs = 0;
for (int l = 0; l < levels; l++)
{
// Size of the current level in blocks, rounding partial blocks up.
int w = BitUtils.DivRoundUp(width, BlockWidth);
int h = BitUtils.DivRoundUp(height, BlockHeight);
for (int l2 = 0; l2 < layers; l2++)
{
for (int z = 0; z < depth; z++)
{
for (int y = 0; y < h; y++)
{
int baseY = y * BlockHeight;
// Number of tile rows that actually fall inside the image for this row of blocks.
int copyHeight = Math.Min(BlockHeight, height - baseY);
int lineBaseOOffs = imageBaseOOffs + baseY * width;
if (copyHeight == 4)
{
// All four rows are inside the image: prepare vector views starting at each output row.
outputLine0 = MemoryMarshal.Cast<uint, Vector128<byte>>(outputAsUint.Slice(lineBaseOOffs));
outputLine1 = MemoryMarshal.Cast<uint, Vector128<byte>>(outputAsUint.Slice(lineBaseOOffs + width));
outputLine2 = MemoryMarshal.Cast<uint, Vector128<byte>>(outputAsUint.Slice(lineBaseOOffs + width * 2));
outputLine3 = MemoryMarshal.Cast<uint, Vector128<byte>>(outputAsUint.Slice(lineBaseOOffs + width * 3));
}
for (int x = 0; x < w; x++)
{
int baseX = x * BlockWidth;
// Number of tile columns that actually fall inside the image for this block.
int copyWidth = Math.Min(BlockWidth, width - baseX);
// Second half of the block goes to the shared BC2/BC3 tile decoder.
BC23DecodeTileRgb(tile, data.Slice(8));
// First half of the block: two alpha endpoint bytes, then the remaining 48 bits.
ulong block = BinaryPrimitives.ReadUInt64LittleEndian(data);
rPal[0] = (byte)block;
rPal[1] = (byte)(block >> 8);
// Derive the rest of the alpha palette from the two endpoints (unsigned variant).
BCnLerpAlphaUnorm(rPal);
// NOTE(review): helper defined elsewhere; presumably writes the alpha byte of each tile
// texel by looking up rPal with the selector bits in block >> 16 - confirm.
BCnDecodeTileAlphaRgba(tile, rPal, block >> 16);
// Both values are at most 4, so the OR equals 4 only when the whole tile is inside the image.
if ((copyWidth | copyHeight) == 4)
{
// Full tile: store each 4-texel row with a single 16-byte write.
outputLine0[x] = tileAsVector128[0];
outputLine1[x] = tileAsVector128[1];
outputLine2[x] = tileAsVector128[2];
outputLine3[x] = tileAsVector128[3];
}
else
{
// Partial tile at the right or bottom edge: copy only the texels inside the image.
int pixelBaseOOffs = lineBaseOOffs + baseX;
for (int tY = 0; tY < copyHeight; tY++)
{
tileAsUint.Slice(tY * 4, copyWidth).CopyTo(outputAsUint.Slice(pixelBaseOOffs + width * tY, copyWidth));
}
}
// Advance to the next 16-byte block.
data = data.Slice(16);
}
}
// Move to the next 2D slice of the output.
imageBaseOOffs += width * height;
}
}
// Dimensions of the next level.
width = Math.Max(1, width >> 1);
height = Math.Max(1, height >> 1);
depth = Math.Max(1, depth >> 1);
}
return output;
}
public static byte[] DecodeBC4(ReadOnlySpan<byte> data, int width, int height, int depth, int levels, int layers, bool signed)
{
int size = 0;
for (int l = 0; l < levels; l++)
{
size += Math.Max(1, width >> l) * Math.Max(1, height >> l) * Math.Max(1, depth >> l) * layers;
}
byte[] output = new byte[size];
Span<byte> outputSpan = new Span<byte>(output);
ReadOnlySpan<ulong> data64 = MemoryMarshal.Cast<byte, ulong>(data);
Span<byte> tile = stackalloc byte[BlockWidth * BlockHeight];
Span<byte> rPal = stackalloc byte[8];
Span<uint> tileAsUint = MemoryMarshal.Cast<byte, uint>(tile);
Span<uint> outputLine0 = default;
Span<uint> outputLine1 = default;
Span<uint> outputLine2 = default;
Span<uint> outputLine3 = default;
int imageBaseOOffs = 0;
for (int l = 0; l < levels; l++)
{
int w = BitUtils.DivRoundUp(width, BlockWidth);
int h = BitUtils.DivRoundUp(height, BlockHeight);
for (int l2 = 0; l2 < layers; l2++)
{
for (int z = 0; z < depth; z++)
{
for (int y = 0; y < h; y++)
{
int baseY = y * BlockHeight;
int copyHeight = Math.Min(BlockHeight, height - baseY);
int lineBaseOOffs = imageBaseOOffs + baseY * width;
if (copyHeight == 4)
{
outputLine0 = MemoryMarshal.Cast<byte, uint>(outputSpan.Slice(lineBaseOOffs));
outputLine1 = MemoryMarshal.Cast<byte, uint>(outputSpan.Slice(lineBaseOOffs + width));
outputLine2 = MemoryMarshal.Cast<byte, uint>(outputSpan.Slice(lineBaseOOffs + width * 2));
outputLine3 = MemoryMarshal.Cast<byte, uint>(outputSpan.Slice(lineBaseOOffs + width * 3));
}
for (int x = 0; x < w; x++)
{
int baseX = x * BlockWidth;
int copyWidth = Math.Min(BlockWidth, width - baseX);
ulong block = data64[0];
@ -52,45 +353,43 @@ namespace Ryujinx.Graphics.Texture
if (signed)
{
CalculateBC3AlphaS(rPal);
BCnLerpAlphaSnorm(rPal);
}
else
{
CalculateBC3Alpha(rPal);
BCnLerpAlphaUnorm(rPal);
}
ulong rI = block >> 16;
BCnDecodeTileAlpha(tile, rPal, block >> 16);
for (int texel = 0; texel < BlockWidth * BlockHeight; texel++)
if ((copyWidth | copyHeight) == 4)
{
int tX = texel & 3;
int tY = texel >> 2;
outputLine0[x] = tileAsUint[0];
outputLine1[x] = tileAsUint[1];
outputLine2[x] = tileAsUint[2];
outputLine3[x] = tileAsUint[3];
}
else
{
int pixelBaseOOffs = lineBaseOOffs + baseX;
if (baseX + tX >= width || baseY + tY >= height)
for (int tY = 0; tY < copyHeight; tY++)
{
continue;
tile.Slice(tY * 4, copyWidth).CopyTo(outputSpan.Slice(pixelBaseOOffs + width * tY, copyWidth));
}
int shift = texel * 3;
byte r = rPal[(int)((rI >> shift) & 7)];
int oOffs = lineBaseOOffs + tY * width + tX;
output[oOffs] = r;
}
data64 = data64.Slice(1);
}
baseOOffs += width * (baseY + BlockHeight > height ? (height & (BlockHeight - 1)) : BlockHeight);
}
imageBaseOOffs += width * height;
}
}
width = Math.Max(1, width >> 1);
width = Math.Max(1, width >> 1);
height = Math.Max(1, height >> 1);
depth = Math.Max(1, depth >> 1);
depth = Math.Max(1, depth >> 1);
}
return output;
@ -109,10 +408,22 @@ namespace Ryujinx.Graphics.Texture
ReadOnlySpan<ulong> data64 = MemoryMarshal.Cast<byte, ulong>(data);
Span<byte> rTile = stackalloc byte[BlockWidth * BlockHeight * 2];
Span<byte> gTile = stackalloc byte[BlockWidth * BlockHeight * 2];
Span<byte> rPal = stackalloc byte[8];
Span<byte> gPal = stackalloc byte[8];
int baseOOffs = 0;
Span<ushort> outputAsUshort = MemoryMarshal.Cast<byte, ushort>(output);
Span<uint> rTileAsUint = MemoryMarshal.Cast<byte, uint>(rTile);
Span<uint> gTileAsUint = MemoryMarshal.Cast<byte, uint>(gTile);
Span<ulong> outputLine0 = default;
Span<ulong> outputLine1 = default;
Span<ulong> outputLine2 = default;
Span<ulong> outputLine3 = default;
int imageBaseOOffs = 0;
for (int l = 0; l < levels; l++)
{
@ -126,11 +437,21 @@ namespace Ryujinx.Graphics.Texture
for (int y = 0; y < h; y++)
{
int baseY = y * BlockHeight;
int copyHeight = Math.Min(BlockHeight, height - baseY);
int lineBaseOOffs = imageBaseOOffs + baseY * width;
if (copyHeight == 4)
{
outputLine0 = MemoryMarshal.Cast<ushort, ulong>(outputAsUshort.Slice(lineBaseOOffs));
outputLine1 = MemoryMarshal.Cast<ushort, ulong>(outputAsUshort.Slice(lineBaseOOffs + width));
outputLine2 = MemoryMarshal.Cast<ushort, ulong>(outputAsUshort.Slice(lineBaseOOffs + width * 2));
outputLine3 = MemoryMarshal.Cast<ushort, ulong>(outputAsUshort.Slice(lineBaseOOffs + width * 3));
}
for (int x = 0; x < w; x++)
{
int baseX = x * BlockWidth;
int lineBaseOOffs = baseOOffs + baseX;
int copyWidth = Math.Min(BlockWidth, width - baseX);
ulong blockL = data64[0];
ulong blockH = data64[1];
@ -142,101 +463,346 @@ namespace Ryujinx.Graphics.Texture
if (signed)
{
CalculateBC3AlphaS(rPal);
CalculateBC3AlphaS(gPal);
BCnLerpAlphaSnorm(rPal);
BCnLerpAlphaSnorm(gPal);
}
else
{
CalculateBC3Alpha(rPal);
CalculateBC3Alpha(gPal);
BCnLerpAlphaUnorm(rPal);
BCnLerpAlphaUnorm(gPal);
}
ulong rI = blockL >> 16;
ulong gI = blockH >> 16;
BCnDecodeTileAlpha(rTile, rPal, blockL >> 16);
BCnDecodeTileAlpha(gTile, gPal, blockH >> 16);
for (int texel = 0; texel < BlockWidth * BlockHeight; texel++)
if ((copyWidth | copyHeight) == 4)
{
int tX = texel & 3;
int tY = texel >> 2;
outputLine0[x] = InterleaveBytes(rTileAsUint[0], gTileAsUint[0]);
outputLine1[x] = InterleaveBytes(rTileAsUint[1], gTileAsUint[1]);
outputLine2[x] = InterleaveBytes(rTileAsUint[2], gTileAsUint[2]);
outputLine3[x] = InterleaveBytes(rTileAsUint[3], gTileAsUint[3]);
}
else
{
int pixelBaseOOffs = lineBaseOOffs + baseX;
if (baseX + tX >= width || baseY + tY >= height)
for (int tY = 0; tY < copyHeight; tY++)
{
continue;
int line = pixelBaseOOffs + width * tY;
for (int tX = 0; tX < copyWidth; tX++)
{
int texel = tY * BlockWidth + tX;
outputAsUshort[line + tX] = (ushort)(rTile[texel] | (gTile[texel] << 8));
}
}
int shift = texel * 3;
byte r = rPal[(int)((rI >> shift) & 7)];
byte g = gPal[(int)((gI >> shift) & 7)];
int oOffs = (lineBaseOOffs + tY * width + tX) * 2;
output[oOffs + 0] = r;
output[oOffs + 1] = g;
}
data64 = data64.Slice(2);
}
baseOOffs += width * (baseY + BlockHeight > height ? (height & (BlockHeight - 1)) : BlockHeight);
}
imageBaseOOffs += width * height;
}
}
width = Math.Max(1, width >> 1);
width = Math.Max(1, width >> 1);
height = Math.Max(1, height >> 1);
depth = Math.Max(1, depth >> 1);
depth = Math.Max(1, depth >> 1);
}
return output;
}
/// <summary>
/// Interleaves the bytes of two 32-bit values into a single 64-bit value.
/// Byte k of <paramref name="left"/> lands in byte 2k of the result and
/// byte k of <paramref name="right"/> lands in byte 2k + 1.
/// </summary>
/// <param name="left">Value supplying the even-numbered result bytes.</param>
/// <param name="right">Value supplying the odd-numbered result bytes.</param>
/// <returns>The byte-interleaved 64-bit value.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static ulong InterleaveBytes(uint left, uint right)
{
    // Each helper call leaves a zero byte after every source byte, so shifting the
    // second value by 8 drops its bytes exactly into the gaps of the first.
    return InterleaveBytesWithZeros(left) | (InterleaveBytesWithZeros(right) << 8);
}
/// <summary>
/// Spreads the four bytes of a 32-bit value across a 64-bit value so that
/// source byte k ends up in result byte 2k, with every odd result byte zero.
/// </summary>
/// <param name="value">The 32-bit value to spread out.</param>
/// <returns>The 64-bit value with a zero byte after each source byte.</returns>
private static ulong InterleaveBytesWithZeros(uint value)
{
    ulong wide = value;

    // Move the upper 16 bits up to bits 32..47, leaving bits 16..31 clear.
    ulong halves = (wide ^ (wide << 16)) & 0xffff0000ffffUL;

    // Within each 32-bit half, move the upper byte up by 8 so a zero byte follows each source byte.
    return (halves ^ (halves << 8)) & 0xff00ff00ff00ffUL;
}
/// <summary>
/// Fills entries 2..7 of an 8-entry unsigned palette from its two endpoints.
/// </summary>
/// <remarks>
/// Entries 0 and 1 are read as the endpoints and left untouched.
/// When endpoint 0 is greater than endpoint 1, entries 2..7 are six interpolated values (weights in sevenths).
/// Otherwise entries 2..5 are four interpolated values (weights in fifths), entry 6 is 0 and entry 7 is 0xff.
/// </remarks>
/// <param name="alpha">Palette of at least 8 bytes; entries 0 and 1 must already hold the endpoints.</param>
private static void BCnLerpAlphaUnorm(Span<byte> alpha)
{
    byte a0 = alpha[0];
    byte a1 = alpha[1];

    if (a0 > a1)
    {
        // Six interpolated values between the endpoints.
        alpha[2] = (byte)((6 * a0 + 1 * a1) / 7);
        alpha[3] = (byte)((5 * a0 + 2 * a1) / 7);
        alpha[4] = (byte)((4 * a0 + 3 * a1) / 7);
        alpha[5] = (byte)((3 * a0 + 4 * a1) / 7);
        alpha[6] = (byte)((2 * a0 + 5 * a1) / 7);
        alpha[7] = (byte)((1 * a0 + 6 * a1) / 7);
    }
    else
    {
        // Four interpolated values, followed by the fixed minimum and maximum.
        alpha[2] = (byte)((4 * a0 + 1 * a1) / 5);
        alpha[3] = (byte)((3 * a0 + 2 * a1) / 5);
        alpha[4] = (byte)((2 * a0 + 3 * a1) / 5);
        alpha[5] = (byte)((1 * a0 + 4 * a1) / 5);
        alpha[6] = 0;
        alpha[7] = 0xff;
    }
}
/// <summary>
/// Fills entries 2..7 of an 8-entry signed palette from its two endpoints.
/// </summary>
/// <remarks>
/// Entries 0 and 1 are reinterpreted as signed bytes and left untouched.
/// When endpoint 0 is greater than endpoint 1, entries 2..7 are six interpolated values (weights in sevenths).
/// Otherwise entries 2..5 are four interpolated values (weights in fifths), entry 6 is 0x80 and entry 7 is 0x7f.
/// Results are stored back as the raw byte representation of the signed value.
/// </remarks>
/// <param name="alpha">Palette of at least 8 bytes; entries 0 and 1 must already hold the endpoints.</param>
private static void BCnLerpAlphaSnorm(Span<byte> alpha)
{
    int first = (sbyte)alpha[0];
    int second = (sbyte)alpha[1];

    if (first <= second)
    {
        // Four interpolated values, followed by the fixed signed minimum and maximum.
        for (int step = 1; step <= 4; step++)
        {
            alpha[step + 1] = (byte)(((5 - step) * first + step * second) / 5);
        }

        alpha[6] = 0x80;
        alpha[7] = 0x7f;

        return;
    }

    // Six interpolated values between the endpoints.
    for (int step = 1; step <= 6; step++)
    {
        alpha[step + 1] = (byte)(((7 - step) * first + step * second) / 7);
    }
}
/// <summary>
/// Expands the packed 3-bit palette indices of one block into one palette byte per texel.
/// </summary>
/// <remarks>
/// Texel i uses the index stored in bits 3i..3i+2 of <paramref name="rI"/>.
/// The AVX2 path always produces 16 texels with a single 16-byte store; the scalar path
/// produces BlockWidth * BlockHeight texels (presumably also 16 - confirm against the constants).
/// </remarks>
/// <param name="output">Destination for the decoded values, one byte per texel.</param>
/// <param name="rPal">Palette indexed by the 3-bit values; the AVX2 path reads 8 bytes from it.</param>
/// <param name="rI">Packed 3-bit indices, lowest texel in the least significant bits.</param>
private unsafe static void BCnDecodeTileAlpha(Span<byte> output, Span<byte> rPal, ulong rI)
{
    if (Avx2.IsSupported)
    {
        Span<Vector128<byte>> outputAsVector128 = MemoryMarshal.Cast<byte, Vector128<byte>>(output);

        Vector128<uint> shifts = Vector128.Create(0u, 3u, 6u, 9u);
        Vector128<uint> masks = Vector128.Create(7u);

        Vector128<byte> vClut;

        fixed (byte* pRPal = rPal)
        {
            // Loads the 8 palette bytes into the low half of the vector.
            vClut = Sse2.LoadScalarVector128((ulong*)pRPal).AsByte();
        }

        // The low 24 bits hold the indices of texels 0..7, the next 24 bits those of texels 8..15.
        Vector128<uint> indices0 = Vector128.Create((uint)rI);
        Vector128<uint> indices1 = Vector128.Create((uint)(rI >> 24));

        // Lanes shifted by 0/3/6/9 give the first four texels of each group;
        // a further shift by 12 gives the next four.
        Vector128<uint> indices00 = Avx2.ShiftRightLogicalVariable(indices0, shifts);
        Vector128<uint> indices10 = Avx2.ShiftRightLogicalVariable(indices1, shifts);
        Vector128<uint> indices01 = Sse2.ShiftRightLogical(indices00, 12);
        Vector128<uint> indices11 = Sse2.ShiftRightLogical(indices10, 12);

        indices00 = Sse2.And(indices00, masks);
        indices10 = Sse2.And(indices10, masks);
        indices01 = Sse2.And(indices01, masks);
        indices11 = Sse2.And(indices11, masks);

        // Narrow the sixteen 32-bit indices down to sixteen bytes, keeping texel order.
        Vector128<ushort> indicesW0 = Sse41.PackUnsignedSaturate(indices00.AsInt32(), indices01.AsInt32());
        Vector128<ushort> indicesW1 = Sse41.PackUnsignedSaturate(indices10.AsInt32(), indices11.AsInt32());

        Vector128<byte> indices = Sse2.PackUnsignedSaturate(indicesW0.AsInt16(), indicesW1.AsInt16());

        // Byte shuffle performs all sixteen palette lookups at once.
        outputAsVector128[0] = Ssse3.Shuffle(vClut, indices);
    }
    else
    {
        for (int i = 0; i < BlockWidth * BlockHeight; i++, rI >>= 3)
        {
            output[i] = rPal[(int)(rI & 7)];
        }
    }
}
/// <summary>
/// Expands the packed 3-bit palette indices of one block into the fourth byte of each
/// 4-byte texel of <paramref name="output"/>.
/// </summary>
/// <remarks>
/// Texel i uses the index stored in bits 3i..3i+2 of <paramref name="rI"/>.
/// The AVX2 path ORs the palette value into bits 24..31 of each existing 32-bit texel, whereas
/// the scalar path overwrites byte 3 of each texel. The two agree only when that byte is zero
/// on entry. NOTE(review): presumably callers guarantee this - confirm.
/// The AVX2 path always handles 16 texels; the scalar path handles BlockWidth * BlockHeight texels.
/// </remarks>
/// <param name="output">Destination texels, 4 bytes each; only the fourth byte of each is modified.</param>
/// <param name="rPal">Palette indexed by the 3-bit values; the AVX2 path reads 8 bytes from it.</param>
/// <param name="rI">Packed 3-bit indices, lowest texel in the least significant bits.</param>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private unsafe static void BCnDecodeTileAlphaRgba(Span<byte> output, Span<byte> rPal, ulong rI)
{
    if (Avx2.IsSupported)
    {
        Span<Vector256<uint>> outputAsVector256 = MemoryMarshal.Cast<byte, Vector256<uint>>(output);

        Vector256<uint> shifts = Vector256.Create(0u, 3u, 6u, 9u, 12u, 15u, 18u, 21u);

        Vector128<uint> vClut128;

        fixed (byte* pRPal = rPal)
        {
            // Loads the 8 palette bytes into the low half of the vector.
            vClut128 = Sse2.LoadScalarVector128((ulong*)pRPal).AsUInt32();
        }

        // Widen each palette byte to its own 32-bit lane, then move it to the top byte of the lane.
        Vector256<uint> vClut = Avx2.ConvertToVector256Int32(vClut128.AsByte()).AsUInt32();
        vClut = Avx2.ShiftLeftLogical(vClut, 24);

        // The low 24 bits hold the indices of texels 0..7, the next 24 bits those of texels 8..15.
        Vector256<uint> indices0 = Vector256.Create((uint)rI);
        Vector256<uint> indices1 = Vector256.Create((uint)(rI >> 24));

        // No explicit mask is applied before the permute below.
        indices0 = Avx2.ShiftRightLogicalVariable(indices0, shifts);
        indices1 = Avx2.ShiftRightLogicalVariable(indices1, shifts);

        outputAsVector256[0] = Avx2.Or(outputAsVector256[0], Avx2.PermuteVar8x32(vClut, indices0));
        outputAsVector256[1] = Avx2.Or(outputAsVector256[1], Avx2.PermuteVar8x32(vClut, indices1));
    }
    else
    {
        // Start at byte 3 and step by 4 so only the fourth byte of each texel is written.
        for (int i = 3; i < BlockWidth * BlockHeight * 4; i += 4, rI >>= 3)
        {
            output[i] = rPal[(int)(rI & 7)];
        }
    }
}
/// <summary>
/// Decodes one BC1 block into 32-bit texels.
/// </summary>
/// <remarks>
/// The first four input bytes hold two little-endian RGB565 endpoints. They are expanded to
/// 8 bits per channel with the top byte forced to 0xff, two further palette entries are derived
/// from them, and the selector bits that follow pick one entry per texel.
/// </remarks>
/// <param name="output">Destination for the decoded texels, 4 bytes each.</param>
/// <param name="input">The encoded block: endpoints in bytes 0..3, selectors from byte 4.</param>
private unsafe static void BC1DecodeTileRgb(Span<byte> output, ReadOnlySpan<byte> input)
{
    const uint OpaqueAlpha = 0xff000000;

    Span<uint> palette = stackalloc uint[4];

    uint endpoints = BinaryPrimitives.ReadUInt32LittleEndian(input);
    uint rawColor0 = endpoints & 0xffff;
    uint rawColor1 = endpoints >> 16;

    uint color0 = ConvertRgb565ToRgb888(rawColor0) | OpaqueAlpha;
    uint color1 = ConvertRgb565ToRgb888(rawColor1) | OpaqueAlpha;

    palette[0] = color0;
    palette[1] = color1;

    // The raw endpoint ordering selects which interpolation mode the helpers use.
    palette[2] = BC1LerpRgb2(color0, color1, rawColor0, rawColor1);
    palette[3] = BC1LerpRgb3(color0, color1, rawColor0, rawColor1);

    BCnDecodeTileRgb(palette, output, input);
}
/// <summary>
/// Decodes the colour part of one BC2/BC3 block into 32-bit texels.
/// </summary>
/// <remarks>
/// The first four input bytes hold two little-endian RGB565 endpoints. They are expanded to
/// 8 bits per channel, two interpolated palette entries are derived from them, and the selector
/// bits that follow pick one entry per texel. The top byte of every palette entry is left at zero.
/// </remarks>
/// <param name="output">Destination for the decoded texels, 4 bytes each.</param>
/// <param name="input">The encoded colour block: endpoints in bytes 0..3, selectors from byte 4.</param>
private unsafe static void BC23DecodeTileRgb(Span<byte> output, ReadOnlySpan<byte> input)
{
    Span<uint> palette = stackalloc uint[4];

    uint endpoints = BinaryPrimitives.ReadUInt32LittleEndian(input);

    uint color0 = ConvertRgb565ToRgb888(endpoints & 0xffff);
    uint color1 = ConvertRgb565ToRgb888(endpoints >> 16);

    palette[0] = color0;
    palette[1] = color1;
    palette[2] = BC23LerpRgb2(color0, color1);
    palette[3] = BC23LerpRgb3(color0, color1);

    BCnDecodeTileRgb(palette, output, input);
}
/// <summary>
/// Writes one 32-bit palette entry per texel, chosen by the 2-bit selectors stored
/// in the little-endian 32-bit word at byte offset 4 of <paramref name="input"/>.
/// </summary>
/// <remarks>
/// Texel i uses selector bits 2i..2i+1. The AVX2 path always writes 16 texels with two
/// 32-byte stores; the scalar path writes BlockWidth * BlockHeight texels.
/// </remarks>
/// <param name="clut">Four-entry colour palette.</param>
/// <param name="output">Destination for the decoded texels, 4 bytes each.</param>
/// <param name="input">The encoded colour block; only bytes 4..7 are read here.</param>
private unsafe static void BCnDecodeTileRgb(Span<uint> clut, Span<byte> output, ReadOnlySpan<byte> input)
{
    if (!Avx2.IsSupported)
    {
        Span<uint> texels = MemoryMarshal.Cast<byte, uint>(output);

        uint selectorBits = BinaryPrimitives.ReadUInt32LittleEndian(input.Slice(4));

        for (int texel = 0; texel < BlockWidth * BlockHeight; texel++)
        {
            texels[texel] = clut[(int)(selectorBits & 3)];
            selectorBits >>= 2;
        }

        return;
    }

    Span<Vector256<uint>> halves = MemoryMarshal.Cast<byte, Vector256<uint>>(output);

    Vector256<uint> lowShifts = Vector256.Create(0u, 2u, 4u, 6u, 8u, 10u, 12u, 14u);
    Vector256<uint> highShifts = Vector256.Create(16u, 18u, 20u, 22u, 24u, 26u, 28u, 30u);
    Vector256<uint> selectorMask = Vector256.Create(3u);

    Vector256<uint> palette;

    fixed (uint* pPalette = &clut[0])
    {
        // Only lanes 0..3 are meaningful; the masked selectors never address the upper lanes.
        palette = Sse2.LoadVector128(pPalette).ToVector256Unsafe();
    }

    Vector256<uint> selectors;

    fixed (byte* pBlock = input)
    {
        selectors = Avx2.BroadcastScalarToVector256((uint*)(pBlock + 4));
    }

    // Each lane isolates the selector of one texel: texels 0..7, then texels 8..15.
    Vector256<uint> firstEight = Avx2.And(Avx2.ShiftRightLogicalVariable(selectors, lowShifts), selectorMask);
    Vector256<uint> lastEight = Avx2.And(Avx2.ShiftRightLogicalVariable(selectors, highShifts), selectorMask);

    halves[0] = Avx2.PermuteVar8x32(palette, firstEight);
    halves[1] = Avx2.PermuteVar8x32(palette, lastEight);
}
/// <summary>
/// Computes BC1 palette entry 2 with the top byte forced to 0xff.
/// </summary>
/// <param name="color0">First endpoint, expanded to 8 bits per channel.</param>
/// <param name="color1">Second endpoint, expanded to 8 bits per channel.</param>
/// <param name="c0">Raw value of the first endpoint, used only for the mode comparison.</param>
/// <param name="c1">Raw value of the second endpoint, used only for the mode comparison.</param>
/// <returns>
/// The two-thirds/one-third blend when <paramref name="c0"/> is greater than <paramref name="c1"/>;
/// otherwise the per-byte average of the two colours.
/// </returns>
private static uint BC1LerpRgb2(uint color0, uint color1, uint c0, uint c1)
{
    const uint OpaqueAlpha = 0xff000000;

    if (c0 <= c1)
    {
        // Per-byte floor((a + b) / 2) without carries crossing byte boundaries:
        // the shared bits plus half of the differing bits.
        uint sharedBits = color0 & color1;
        uint halfOfDifference = ((color0 ^ color1) >> 1) & 0x7f7f7f;

        return (halfOfDifference + sharedBits) | OpaqueAlpha;
    }

    return BC23LerpRgb2(color0, color1) | OpaqueAlpha;
}
/// <summary>
/// Blends two packed colours per channel as (2 * color0 + color1) / 3, rounding down.
/// </summary>
/// <remarks>
/// Only the low three bytes take part; the top byte of the result is always zero.
/// </remarks>
/// <param name="color0">Colour weighted two thirds.</param>
/// <param name="color1">Colour weighted one third.</param>
/// <returns>The blended colour in the same byte layout as the inputs.</returns>
private static uint BC23LerpRgb2(uint color0, uint color1)
{
    uint low = (2 * (color0 & 0xff) + (color1 & 0xff)) / 3;
    uint middle = (2 * ((color0 >> 8) & 0xff) + ((color1 >> 8) & 0xff)) / 3;
    uint high = (2 * ((color0 >> 16) & 0xff) + ((color1 >> 16) & 0xff)) / 3;

    return low | (middle << 8) | (high << 16);
}
/// <summary>
/// Computes BC1 palette entry 3.
/// </summary>
/// <param name="color0">First endpoint, expanded to 8 bits per channel.</param>
/// <param name="color1">Second endpoint, expanded to 8 bits per channel.</param>
/// <param name="c0">Raw value of the first endpoint, used only for the mode comparison.</param>
/// <param name="c1">Raw value of the second endpoint, used only for the mode comparison.</param>
/// <returns>
/// The one-third/two-thirds blend with the top byte forced to 0xff when
/// <paramref name="c0"/> is greater than <paramref name="c1"/>; otherwise 0.
/// </returns>
private static uint BC1LerpRgb3(uint color0, uint color1, uint c0, uint c1)
{
    return c0 > c1 ? BC23LerpRgb3(color0, color1) | 0xff000000 : 0u;
}
/// <summary>
/// Blends two packed colours per channel as (color0 + 2 * color1) / 3, rounding down.
/// </summary>
/// <remarks>
/// Only the low three bytes take part; the top byte of the result is always zero.
/// </remarks>
/// <param name="color0">Colour weighted one third.</param>
/// <param name="color1">Colour weighted two thirds.</param>
/// <returns>The blended colour in the same byte layout as the inputs.</returns>
private static uint BC23LerpRgb3(uint color0, uint color1)
{
    uint low = (2 * (color1 & 0xff) + (color0 & 0xff)) / 3;
    uint middle = (2 * ((color1 >> 8) & 0xff) + ((color0 >> 8) & 0xff)) / 3;
    uint high = (2 * ((color1 >> 16) & 0xff) + ((color0 >> 16) & 0xff)) / 3;

    return low | (middle << 8) | (high << 16);
}
/// <summary>
/// Expands a packed 5:6:5 colour to 8 bits per channel.
/// </summary>
/// <remarks>
/// Input bits 11..15 become result byte 0, bits 5..10 become byte 1 and bits 0..4 become byte 2.
/// Each field is widened by replicating its most significant bits into the new low bits,
/// so a full-scale field maps to 0xff. Input bits above bit 15 are ignored and the top
/// byte of the result is zero.
/// </remarks>
/// <param name="value">The packed 16-bit colour.</param>
/// <returns>The expanded colour in the low three bytes.</returns>
private static uint ConvertRgb565ToRgb888(uint value)
{
    uint field5High = (value >> 11) & 0x1f;
    uint field6 = (value >> 5) & 0x3f;
    uint field5Low = value & 0x1f;

    uint byte0 = (field5High << 3) | (field5High >> 2);
    uint byte1 = (field6 << 2) | (field6 >> 4);
    uint byte2 = (field5Low << 3) | (field5Low >> 2);

    return byte0 | (byte1 << 8) | (byte2 << 16);
}
}
}

View File

@ -55,6 +55,8 @@ namespace Ryujinx.HLE.HOS.Services.Hid
// TODO: signal event at right place
_xpadIdEvent.ReadableEvent.Signal();
_vibrationPermitted = true;
}
[CommandHipc(0)]
@ -1141,8 +1143,6 @@ namespace Ryujinx.HLE.HOS.Services.Hid
{
context.ResponseData.Write(_vibrationPermitted);
Logger.Stub?.PrintStub(LogClass.ServiceHid, new { _vibrationPermitted });
return ResultCode.Success;
}

View File

@ -396,7 +396,7 @@ namespace Ryujinx.Modules
if (!OperatingSystem.IsWindows())
{
chmod(ryuBin, 0777);
chmod(ryuBin, 493);
}
}