Implement speculative translation on the CPU (#515)

* Implement speculative translation on the CPU, and change how branches to unknown or untranslated addresses work

* Port t0opt changes and other cleanups

* Change namespace of translation-related classes to ChocolArm64.Translation, and other minor tweaks

* Fix typo

* Translate higher quality code for indirect jumps as well, and in some cases that were missed when lower quality (tier 0) code was available

* Remove debug print

* Remove direct argument passing optimization, and enable tail calls for BR instructions

* Call delegates directly with Callvirt rather than calling Execute, do not emit calls for tier 0 code

* Remove unused property

* Rename argument on ArmSubroutine delegate
This commit is contained in:
gdkchan
2019-02-04 18:26:05 -03:00
committed by GitHub
parent f5b4f6ccc4
commit a694420d11
21 changed files with 656 additions and 376 deletions

View File

@ -11,6 +11,7 @@ namespace ChocolArm64.Translation
class ILEmitterCtx
{
private TranslatorCache _cache;
private TranslatorQueue _queue;
private Dictionary<long, ILLabel> _labels;
@ -23,6 +24,8 @@ namespace ChocolArm64.Translation
public Block CurrBlock => _currBlock;
public OpCode64 CurrOp => _currBlock?.OpCodes[_opcIndex];
public TranslationTier Tier { get; }
public Aarch32Mode Mode { get; } = Aarch32Mode.User; //TODO
private Dictionary<Block, ILBlock> _visitedBlocks;
@ -47,11 +50,14 @@ namespace ChocolArm64.Translation
private const int VecTmp1Index = -5;
private const int VecTmp2Index = -6;
public ILEmitterCtx(TranslatorCache cache, Block graph)
public ILEmitterCtx(TranslatorCache cache, TranslatorQueue queue, TranslationTier tier, Block graph)
{
_cache = cache ?? throw new ArgumentNullException(nameof(cache));
_queue = queue ?? throw new ArgumentNullException(nameof(queue));
_currBlock = graph ?? throw new ArgumentNullException(nameof(graph));
Tier = tier;
_labels = new Dictionary<long, ILLabel>();
_visitedBlocks = new Dictionary<Block, ILBlock>();
@ -243,6 +249,16 @@ namespace ChocolArm64.Translation
return new ILBlock();
}
//Queues a Tier1 translation for the code at the given position, unless the
//cache already holds a translation for it with a tier other than Tier0.
public void TranslateAhead(long position, ExecutionMode mode = ExecutionMode.Aarch64)
{
    bool isCached = _cache.TryGetSubroutine(position, out TranslatedSub cachedSub);

    if (!isCached || cachedSub.Tier == TranslationTier.Tier0)
    {
        _queue.Enqueue(new TranslatorQueueItem(position, mode, TranslationTier.Tier1));
    }
}
public bool TryOptEmitSubroutineCall()
{
if (_currBlock.Next == null)
@ -265,20 +281,8 @@ namespace ChocolArm64.Translation
EmitLdarg(index);
}
foreach (Register reg in subroutine.SubArgs)
{
switch (reg.Type)
{
case RegisterType.Flag: Ldloc(reg.Index, IoType.Flag); break;
case RegisterType.Int: Ldloc(reg.Index, IoType.Int); break;
case RegisterType.Vector: Ldloc(reg.Index, IoType.Vector); break;
}
}
EmitCall(subroutine.Method);
subroutine.AddCaller(_subPosition);
return true;
}
@ -463,7 +467,12 @@ namespace ChocolArm64.Translation
_ilBlock.Add(new ILOpCodeBranch(ilOp, label));
}
public void Emit(string text)
//Appends an op that loads the given field to the IL block being built.
public void EmitFieldLoad(FieldInfo info) => _ilBlock.Add(new ILOpCodeLoadField(info));
//Appends a log op carrying the given text to the IL block being built.
public void EmitPrint(string text) => _ilBlock.Add(new ILOpCodeLog(text));
@ -618,14 +627,9 @@ namespace ChocolArm64.Translation
EmitCall(objType.GetMethod(mthdName, BindingFlags.Instance | BindingFlags.NonPublic));
}
public void EmitCall(MethodInfo mthdInfo)
public void EmitCall(MethodInfo mthdInfo, bool isVirtual = false)
{
if (mthdInfo == null)
{
throw new ArgumentNullException(nameof(mthdInfo));
}
_ilBlock.Add(new ILOpCodeCall(mthdInfo));
_ilBlock.Add(new ILOpCodeCall(mthdInfo ?? throw new ArgumentNullException(nameof(mthdInfo)), isVirtual));
}
public void EmitLdc_I(long value)

View File

@ -26,74 +26,32 @@ namespace ChocolArm64.Translation
_subName = subName;
}
public TranslatedSub GetSubroutine()
public TranslatedSub GetSubroutine(TranslationTier tier)
{
LocalAlloc = new LocalAlloc(_ilBlocks, _ilBlocks[0]);
List<Register> subArgs = new List<Register>();
void SetArgs(long inputs, RegisterType baseType)
{
for (int bit = 0; bit < 64; bit++)
{
long mask = 1L << bit;
if ((inputs & mask) != 0)
{
subArgs.Add(GetRegFromBit(bit, baseType));
}
}
}
SetArgs(LocalAlloc.GetIntInputs(_ilBlocks[0]), RegisterType.Int);
SetArgs(LocalAlloc.GetVecInputs(_ilBlocks[0]), RegisterType.Vector);
DynamicMethod method = new DynamicMethod(_subName, typeof(long), GetArgumentTypes(subArgs));
DynamicMethod method = new DynamicMethod(_subName, typeof(long), TranslatedSub.FixedArgTypes);
Generator = method.GetILGenerator();
TranslatedSub subroutine = new TranslatedSub(method, subArgs);
int argsStart = TranslatedSub.FixedArgTypes.Length;
TranslatedSub subroutine = new TranslatedSub(method, tier);
_locals = new Dictionary<Register, int>();
_localsCount = 0;
for (int index = 0; index < subroutine.SubArgs.Count; index++)
{
Register reg = subroutine.SubArgs[index];
Generator.EmitLdarg(index + argsStart);
Generator.EmitStloc(GetLocalIndex(reg));
}
new ILOpCodeLoadState(_ilBlocks[0]).Emit(this);
foreach (ILBlock ilBlock in _ilBlocks)
{
ilBlock.Emit(this);
}
subroutine.PrepareMethod();
return subroutine;
}
private Type[] GetArgumentTypes(IList<Register> Params)
{
Type[] fixedArgs = TranslatedSub.FixedArgTypes;
Type[] output = new Type[Params.Count + fixedArgs.Length];
fixedArgs.CopyTo(output, 0);
int typeIdx = fixedArgs.Length;
for (int index = 0; index < Params.Count; index++)
{
output[typeIdx++] = GetFieldType(Params[index].Type);
}
return output;
}
public int GetLocalIndex(Register reg)
{
if (!_locals.TryGetValue(reg, out int index))

View File

@ -5,16 +5,19 @@ namespace ChocolArm64.Translation
{
struct ILOpCodeCall : IILEmit
{
private MethodInfo _mthdInfo;
public MethodInfo Info { get; private set; }
public ILOpCodeCall(MethodInfo mthdInfo)
public bool IsVirtual { get; private set; }
public ILOpCodeCall(MethodInfo info, bool isVirtual)
{
_mthdInfo = mthdInfo;
Info = info;
IsVirtual = isVirtual;
}
public void Emit(ILMethodBuilder context)
{
context.Generator.Emit(OpCodes.Call, _mthdInfo);
context.Generator.Emit(IsVirtual ? OpCodes.Callvirt : OpCodes.Call, Info);
}
}
}

View File

@ -0,0 +1,20 @@
using System.Reflection;
using System.Reflection.Emit;
namespace ChocolArm64.Translation
{
//IL op that emits a Ldfld instruction for a given field.
struct ILOpCodeLoadField : IILEmit
{
    //Field read by the emitted instruction.
    public FieldInfo Info { get; private set; }

    public ILOpCodeLoadField(FieldInfo info) => Info = info;

    //Writes the Ldfld instruction to the IL generator of the method builder.
    public void Emit(ILMethodBuilder context) => context.Generator.Emit(OpCodes.Ldfld, Info);
}
}

View File

@ -0,0 +1,65 @@
using ChocolArm64.Memory;
using ChocolArm64.State;
using System;
using System.Reflection;
using System.Reflection.Emit;
namespace ChocolArm64.Translation
{
delegate long ArmSubroutine(CpuThreadState state, MemoryManager memory);
//Holds a translated subroutine: the dynamic method it was emitted to, the tier
//it was translated at and, after PrepareMethod is called, the delegate used to run it.
class TranslatedSub
{
    //Null until PrepareMethod is called.
    public ArmSubroutine Delegate { get; private set; }

    //Positions of the thread state and memory manager parameters on ArmSubroutine.
    public static int StateArgIdx  { get; private set; }
    public static int MemoryArgIdx { get; private set; }

    //Parameter types of ArmSubroutine.Invoke, in declaration order.
    public static Type[] FixedArgTypes { get; private set; }

    public DynamicMethod Method { get; private set; }

    public TranslationTier Tier { get; private set; }

    static TranslatedSub()
    {
        ParameterInfo[] parameters = typeof(ArmSubroutine).GetMethod("Invoke").GetParameters();

        Type[] argTypes = new Type[parameters.Length];

        for (int argIdx = 0; argIdx < parameters.Length; argIdx++)
        {
            Type paramType = parameters[argIdx].ParameterType;

            argTypes[argIdx] = paramType;

            if (paramType == typeof(CpuThreadState))
            {
                StateArgIdx = argIdx;
            }
            else if (paramType == typeof(MemoryManager))
            {
                MemoryArgIdx = argIdx;
            }
        }

        FixedArgTypes = argTypes;
    }

    public TranslatedSub(DynamicMethod method, TranslationTier tier)
    {
        if (method == null)
        {
            throw new ArgumentNullException(nameof(method));
        }

        Method = method;
        Tier   = tier;
    }

    //Creates the delegate for the dynamic method and stores it on Delegate.
    public void PrepareMethod()
    {
        Delegate = (ArmSubroutine)Method.CreateDelegate(typeof(ArmSubroutine));
    }

    //Invokes the delegate and returns its result.
    public long Execute(CpuThreadState threadState, MemoryManager memory) => Delegate(threadState, memory);
}
}

View File

@ -0,0 +1,11 @@
namespace ChocolArm64.Translation
{
//Tier of a translation. The queue polls its per-tier stacks in ascending
//order of these values, and compares tiers with <= to skip redundant work.
enum TranslationTier
{
//Tier passed by Translator.TranslateLowCq, which decodes a single basic block.
Tier0,
//Tier passed by Translator.TranslateHighCq, which decodes a whole subroutine.
Tier1,
//NOTE(review): not referenced by any of the visible code.
Tier2,
//Number of tiers; TranslatorQueue uses it to size its per-tier array.
Count
}
}

View File

@ -0,0 +1,188 @@
using ChocolArm64.Decoders;
using ChocolArm64.Events;
using ChocolArm64.Memory;
using ChocolArm64.State;
using System;
using System.Threading;
namespace ChocolArm64.Translation
{
public class Translator
{
private MemoryManager _memory;
private CpuThreadState _dummyThreadState;
private TranslatorCache _cache;
private TranslatorQueue _queue;
private Thread _backgroundTranslator;
public event EventHandler<CpuTraceEventArgs> CpuTrace;
public bool EnableCpuTrace { get; set; }
private volatile int _threadCount;
//Creates a translator that reads guest code from the given memory manager.
public Translator(MemoryManager memory)
{
    _memory = memory;

    //Thread state with Running cleared; it is the state passed to
    //subroutines by ForceAheadOfTimeCompilation.
    _dummyThreadState = new CpuThreadState { Running = false };

    _cache = new TranslatorCache();
    _queue = new TranslatorQueue();
}
//Runs guest code starting at the given position on the given thread.
//The first thread to enter starts the background translator thread, and the
//last thread to leave signals the queue so that the background thread can exit.
internal void ExecuteSubroutine(CpuThread thread, long position)
{
    if (Interlocked.Increment(ref _threadCount) == 1)
    {
        _backgroundTranslator = new Thread(TranslateQueuedSubs);
        _backgroundTranslator.Start();
    }

    try
    {
        ExecuteSubroutine(thread.ThreadState, position);
    }
    finally
    {
        //The count must go down even when execution throws, otherwise
        //the background translator loop would never see it reach zero.
        if (Interlocked.Decrement(ref _threadCount) == 0)
        {
            _queue.ForceSignal();
        }
    }
}
//Execution loop: runs the subroutine at the current position and continues at
//the position it returns, until that position is zero or the state stops running.
private void ExecuteSubroutine(CpuThreadState state, long position)
{
    state.CurrentTranslator = this;

    while (true)
    {
        if (EnableCpuTrace)
        {
            CpuTrace?.Invoke(this, new CpuTraceEventArgs(position));
        }

        position = GetOrTranslateSubroutine(state, position).Execute(state, _memory);

        if (position == 0 || !state.Running)
        {
            break;
        }
    }

    state.CurrentTranslator = null;
}
//Queues a Tier1 translation for the given position, unless the cache already
//holds a translation for it with a tier other than Tier0.
internal void TranslateVirtualSubroutine(CpuThreadState state, long position)
{
    if (_cache.TryGetSubroutine(position, out TranslatedSub cached) && cached.Tier != TranslationTier.Tier0)
    {
        return;
    }

    TranslatorQueueItem request = new TranslatorQueueItem(position, state.GetExecutionMode(), TranslationTier.Tier1);

    _queue.Enqueue(request);
}
//Returns the delegate of the subroutine at the given position, translating it
//with TranslateLowCq when it is not cached. When the subroutine is Tier0,
//a Tier1 translation of the same position is also queued.
internal ArmSubroutine GetOrTranslateVirtualSubroutine(CpuThreadState state, long position)
{
    TranslatedSub target = GetOrTranslateSubroutine(state, position);

    if (target.Tier == TranslationTier.Tier0)
    {
        TranslatorQueueItem request = new TranslatorQueueItem(position, state.GetExecutionMode(), TranslationTier.Tier1);

        _queue.Enqueue(request);
    }

    return target.Delegate;
}
//Returns the cached subroutine for the given position, or translates it
//with TranslateLowCq when the cache has no entry for it.
internal TranslatedSub GetOrTranslateSubroutine(CpuThreadState state, long position)
{
    return _cache.TryGetSubroutine(position, out TranslatedSub cached)
        ? cached
        : TranslateLowCq(position, state.GetExecutionMode());
}
//Body of the background translator thread. Processes queued translation
//requests for as long as the thread count is not zero.
private void TranslateQueuedSubs()
{
    while (_threadCount != 0)
    {
        if (!_queue.TryDequeue(out TranslatorQueueItem request))
        {
            _queue.WaitForItems();

            continue;
        }

        bool isCached = _cache.TryGetSubroutine(request.Position, out TranslatedSub cached);

        //Nothing to do when the cached translation is already at the requested tier or above.
        if (isCached && request.Tier <= cached.Tier)
        {
            continue;
        }

        if (request.Tier != TranslationTier.Tier0)
        {
            TranslateHighCq(request.Position, request.Mode);
        }
        else
        {
            TranslateLowCq(request.Position, request.Mode);
        }
    }
}
//Translates the basic block at the given position as Tier0 and stores it on
//the cache with GetOrAdd, returning the subroutine that GetOrAdd returns.
private TranslatedSub TranslateLowCq(long position, ExecutionMode mode)
{
    Block basicBlock = Decoder.DecodeBasicBlock(_memory, position, mode);

    ILEmitterCtx emitter = new ILEmitterCtx(_cache, _queue, TranslationTier.Tier0, basicBlock);

    ILMethodBuilder builder = new ILMethodBuilder(emitter.GetILBlocks(), GetSubroutineName(position));

    TranslatedSub translated = builder.GetSubroutine(TranslationTier.Tier0);

    return _cache.GetOrAdd(position, translated, basicBlock.OpCodes.Count);
}
//Translates the subroutine at the given position as Tier1, stores it on the
//cache (replacing any existing entry), then runs ForceAheadOfTimeCompilation on it.
private void TranslateHighCq(long position, ExecutionMode mode)
{
    Block subGraph = Decoder.DecodeSubroutine(_memory, position, mode);

    ILEmitterCtx emitter = new ILEmitterCtx(_cache, _queue, TranslationTier.Tier1, subGraph);

    ILBlock[] emittedBlocks = emitter.GetILBlocks();

    ILMethodBuilder builder = new ILMethodBuilder(emittedBlocks, GetSubroutineName(position));

    TranslatedSub translated = builder.GetSubroutine(TranslationTier.Tier1);

    //The total IL op count is used as the size of the cache entry.
    int totalOps = 0;

    for (int index = 0; index < emittedBlocks.Length; index++)
    {
        totalOps += emittedBlocks[index].Count;
    }

    _cache.AddOrUpdate(position, translated, totalOps);

    ForceAheadOfTimeCompilation(translated);
}
//Builds the method name: "Sub" followed by the position as 16 lowercase hex digits.
private string GetSubroutineName(long position) => $"Sub{position:x16}";
//Executes the subroutine once with the dummy thread state (Running cleared)
//and no memory manager; the returned position is discarded.
private void ForceAheadOfTimeCompilation(TranslatedSub subroutine) => subroutine.Execute(_dummyThreadState, null);
}
}

View File

@ -0,0 +1,196 @@
using System.Collections.Concurrent;
using System.Collections.Generic;
using System.Diagnostics;
using System.Runtime.CompilerServices;
using System.Threading;
namespace ChocolArm64.Translation
{
class TranslatorCache
{
//Maximum size of the cache, the unit used is completely arbitrary.
private const int MaxTotalSize = 0x800000;
//Minimum time required in milliseconds for a method to be eligible for deletion.
private const int MinTimeDelta = 2 * 60000;
//Minimum number of calls required to update the timestamp.
private const int MinCallCountForUpdate = 250;
//Cache entry: a translated subroutine together with its node on the sorted
//list, its size, its call counter and the time its node was last updated.
private class CacheBucket
{
    public TranslatedSub Subroutine { get; private set; }

    public LinkedListNode<long> Node { get; private set; }

    public int CallCount { get; set; }

    public int Size { get; private set; }

    public long Timestamp { get; private set; }

    public CacheBucket(TranslatedSub subroutine, LinkedListNode<long> node, int size)
    {
        Subroutine = subroutine;
        Size       = size;
        Node       = node;
        Timestamp  = GetTimestamp();
    }

    //Replaces the list node and refreshes the timestamp.
    public void UpdateNode(LinkedListNode<long> node)
    {
        Timestamp = GetTimestamp();
        Node      = node;
    }
}
private ConcurrentDictionary<long, CacheBucket> _cache;
private LinkedList<long> _sortedCache;
private int _totalSize;
//Creates an empty cache.
public TranslatorCache()
{
    //List of cached positions; new and refreshed entries are appended to
    //the end, and eviction removes entries from the front.
    _sortedCache = new LinkedList<long>();

    _cache = new ConcurrentDictionary<long, CacheBucket>();
}
//Adds the subroutine to the cache under the given position when there is no
//entry for that position yet, and returns the subroutine that is on the cache
//after the call (the existing one when an entry was already present).
//The size is only added to the total when the new entry was actually stored.
public TranslatedSub GetOrAdd(long position, TranslatedSub subroutine, int size)
{
ClearCacheIfNeeded();
lock (_sortedCache)
{
//The list node is added up front, since the bucket needs it.
LinkedListNode<long> node = _sortedCache.AddLast(position);
CacheBucket bucket = new CacheBucket(subroutine, node, size);
bucket = _cache.GetOrAdd(position, bucket);
//If the returned bucket holds our node, then our bucket was the one stored.
if (bucket.Node == node)
{
_totalSize += size;
}
else
{
//An entry already existed, so the node added above is not needed.
_sortedCache.Remove(node);
}
return bucket.Subroutine;
}
}
//Stores the subroutine on the cache under the given position, replacing any
//existing entry. When an entry is replaced, its size is subtracted from the
//total and its node is removed from the sorted list.
public void AddOrUpdate(long position, TranslatedSub subroutine, int size)
{
ClearCacheIfNeeded();
lock (_sortedCache)
{
_totalSize += size;
LinkedListNode<long> node = _sortedCache.AddLast(position);
CacheBucket newBucket = new CacheBucket(subroutine, node, size);
//NOTE(review): the update callback has side effects, so it relies on running
//only once per call. All dictionary writes visible in this class happen while
//holding the lock on _sortedCache - confirm that remains true.
_cache.AddOrUpdate(position, newBucket, (key, bucket) =>
{
_totalSize -= bucket.Size;
_sortedCache.Remove(bucket.Node);
return newBucket;
});
}
}
//Checks if there is a cache entry for the given position.
public bool HasSubroutine(long position) => _cache.ContainsKey(position);
//Looks up the subroutine cached for the given position. Returns true and the
//subroutine on a hit, false and the default value on a miss.
//Every hit increments the call counter of the entry. Once the counter is above
//MinCallCountForUpdate, and the lock can be taken without blocking, the entry is
//moved to the end of the sorted list, its timestamp is refreshed and its counter reset.
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public bool TryGetSubroutine(long position, out TranslatedSub subroutine)
{
if (_cache.TryGetValue(position, out CacheBucket bucket))
{
//The increment is done without holding the lock.
if (bucket.CallCount++ > MinCallCountForUpdate)
{
//TryEnter is used so that a lookup never waits for the lock;
//when the lock is taken, the refresh is simply skipped this time.
if (Monitor.TryEnter(_sortedCache))
{
try
{
//The bucket value on the dictionary may have changed between the
//time we get the value from the dictionary, and we acquire the
//lock. So we need to ensure we are working with the latest value,
//we can do that by getting the value again, inside the lock.
if (_cache.TryGetValue(position, out CacheBucket latestBucket))
{
latestBucket.CallCount = 0;
_sortedCache.Remove(latestBucket.Node);
latestBucket.UpdateNode(_sortedCache.AddLast(position));
}
}
finally
{
Monitor.Exit(_sortedCache);
}
}
}
//The subroutine returned is the one from the first lookup.
subroutine = bucket.Subroutine;
return true;
}
subroutine = default(TranslatedSub);
return false;
}
//Removes entries from the front of the sorted list while the total size is
//above MaxTotalSize. Stops when the list is empty, or at the first entry whose
//timestamp is not more than MinTimeDelta milliseconds older than the time
//this method was called.
private void ClearCacheIfNeeded()
{
//The current time is read once, and used for all entries checked below.
long timestamp = GetTimestamp();
//The total size is read here without holding the lock.
while (_totalSize > MaxTotalSize)
{
lock (_sortedCache)
{
LinkedListNode<long> node = _sortedCache.First;
if (node == null)
{
break;
}
//NOTE(review): the indexer throws if the position is not on the dictionary,
//this assumes that the list and the dictionary are always in sync.
CacheBucket bucket = _cache[node.Value];
long timeDelta = timestamp - bucket.Timestamp;
if (timeDelta <= MinTimeDelta)
{
break;
}
if (_cache.TryRemove(node.Value, out bucket))
{
_totalSize -= bucket.Size;
_sortedCache.Remove(bucket.Node);
}
}
}
}
//Gets the current Stopwatch timestamp converted to milliseconds.
private static long GetTimestamp()
{
    long ticksPerMillisecond = Stopwatch.Frequency / 1000;

    return Stopwatch.GetTimestamp() / ticksPerMillisecond;
}
}
}

View File

@ -0,0 +1,83 @@
using System.Collections.Concurrent;
using System.Threading;
namespace ChocolArm64.Translation
{
class TranslatorQueue
{
//This is the maximum number of functions to be translated that the queue can hold.
//The value may need some tuning to find the sweet spot.
private const int MaxQueueSize = 1024;
private ConcurrentStack<TranslatorQueueItem>[] _translationQueue;
private ManualResetEvent _queueDataReceivedEvent;
private bool _signaled;
//Creates the queue, with one empty stack per translation tier.
public TranslatorQueue()
{
    int tierCount = (int)TranslationTier.Count;

    _translationQueue = new ConcurrentStack<TranslatorQueueItem>[tierCount];

    for (int tier = 0; tier < tierCount; tier++)
    {
        _translationQueue[tier] = new ConcurrentStack<TranslatorQueueItem>();
    }

    //The event starts non-signaled, it is set when an item is enqueued.
    _queueDataReceivedEvent = new ManualResetEvent(false);
}
//Pushes the item to the stack of its tier and signals the event.
//When that stack already holds MaxQueueSize items or more, the item
//on the top of the stack is popped and discarded first.
public void Enqueue(TranslatorQueueItem item)
{
    ConcurrentStack<TranslatorQueueItem> tierStack = _translationQueue[(int)item.Tier];

    bool isFull = tierStack.Count >= MaxQueueSize;

    if (isFull)
    {
        tierStack.TryPop(out _);
    }

    tierStack.Push(item);

    _queueDataReceivedEvent.Set();
}
//Pops an item from the first non-empty stack, checking the stacks in
//ascending tier order. Returns false when all the stacks are empty.
public bool TryDequeue(out TranslatorQueueItem item)
{
    foreach (ConcurrentStack<TranslatorQueueItem> tierStack in _translationQueue)
    {
        if (tierStack.TryPop(out item))
        {
            return true;
        }
    }

    item = default(TranslatorQueueItem);

    return false;
}
//Blocks until the event is signaled, then resets the event so that the next
//call blocks again. The reset is skipped after ForceSignal was called,
//which leaves the event signaled.
public void WaitForItems()
{
_queueDataReceivedEvent.WaitOne();
//Same lock as ForceSignal, so the flag check and the reset
//can not interleave with it.
lock (_queueDataReceivedEvent)
{
if (!_signaled)
{
_queueDataReceivedEvent.Reset();
}
}
}
//Signals the event and leaves it signaled from now on: once the flag is set,
//WaitForItems no longer resets the event, so any waiter returns immediately.
public void ForceSignal()
{
    lock (_queueDataReceivedEvent)
    {
        _signaled = true;

        //The event is intentionally not closed here. Enqueue and WaitForItems can
        //still be called after this point (a translation that is in progress may
        //enqueue more items), and Set/WaitOne throw ObjectDisposedException when
        //called on a closed handle.
        _queueDataReceivedEvent.Set();
    }
}
}
}

View File

@ -0,0 +1,20 @@
using ChocolArm64.State;
namespace ChocolArm64.Translation
{
//Translation request: what to translate, on which execution mode, and at which tier.
struct TranslatorQueueItem
{
//Position of the code to be translated.
public long Position { get; }
//Execution mode passed to the decoder when the item is translated.
public ExecutionMode Mode { get; }
//Requested tier, also selects the stack where the queue stores the item.
public TranslationTier Tier { get; }
public TranslatorQueueItem(long position, ExecutionMode mode, TranslationTier tier)
{
Position = position;
Mode = mode;
Tier = tier;
}
}
}