The most efficient method at the moment is to connect System.Numerics.Vectors
from nuget, and vectorize your operations. At the same time you get them in a generic form. For System.Numerics.Vectors.Vector<T>
addition operation is defined, even if the structure T
itself is unknown!
Here you have a simplified class representing a vector of arbitrary length with the operation of addition:
class VectorNum<T> where T: struct { List<System.Numerics.Vector<T>> payload; int size; public VectorNum(int size) { var vectorSize = System.Numerics.Vector<T>.Count; var vectorCount = (size + vectorSize - 1) / vectorSize; payload = new List<System.Numerics.Vector<T>>(vectorCount); this.size = size; } public void AddTo(VectorNum<T> r) { List<System.Numerics.Vector<T>> ldata = payload, rdata = r.payload; if (size != r.size) throw new ArgumentException(); for (int i = 0; i < ldata.Count; i++) ldata[i] = ldata[i] + rdata[i]; } }
Benchmarks below.
A simpler (but less efficient) method is probably this:
static T add(T l, T r) { return (T)((dynamic)l + r); }
At the moment, it is impossible to explain to the compiler that the type of generic parameter is numeric. Worse, this is problematic because the same IL works for every generic type instantiation. And the addition of int
's and double
' s is represented on the IL-level by different instructions.
Performance measurement
Here is a test program:
static class Program { static void Main() { VectorExpr<double> ve1 = new VectorExpr<double>(2048), ve2 = new VectorExpr<double>(2048); VectorDyn<double> vd1 = new VectorDyn<double>(2048), vd2 = new VectorDyn<double>(2048); VectorNatDouble vn1 = new VectorNatDouble(2048), vn2 = new VectorNatDouble(2048); VectorNum<double> vv1 = new VectorNum<double>(2048), vv2 = new VectorNum<double>(2048); // прогрев ve1.Init(1.1, 2.2); ve2.Init(1.1, 2.2); vd1.Init(1.1, 2.2); vd2.Init(1.1, 2.2); vn1.Init(1.1, 2.2); vn2.Init(1.1, 2.2); vv1.Init(1.1, 2.2); vv2.Init(1.1, 2.2); ve1.AddTo(ve2); vd1.AddTo(vd2); vn1.AddTo(vn2); vv1.AddTo(vv2); Stopwatch swE = new Stopwatch(), swD = new Stopwatch(), swN = new Stopwatch(), swV = new Stopwatch(); swE.Start(); for (int i = 0; i < 1000 * 1000; i++) ve1.AddTo(ve2); swE.Stop(); var resultE = ve1.Sum(); swD.Start(); for (int i = 0; i < 1000 * 1000; i++) vd1.AddTo(vd2); swD.Stop(); var resultD = vd1.Sum(); swN.Start(); for (int i = 0; i < 1000 * 1000; i++) vn1.AddTo(vn2); swN.Stop(); var resultN = vn1.Sum(); swV.Start(); for (int i = 0; i < 1000 * 1000; i++) vv1.AddTo(vv2); swV.Stop(); var resultV = vv1.Sum(); Console.WriteLine($"Expression: elapsed time {swE.Elapsed}, result = {resultE}"); Console.WriteLine($"Dynamic : elapsed time {swD.Elapsed}, result = {resultD}"); Console.WriteLine($"Native : elapsed time {swN.Elapsed}, result = {resultN}"); Console.WriteLine($"Vector : elapsed time {swV.Elapsed}, result = {resultV}"); } } class VectorExpr<T> { static readonly Func<T, T, T> add; static VectorExpr() { var a = Expression.Parameter(typeof(T)); var b = Expression.Parameter(typeof(T)); add = Expression.Lambda<Func<T, T, T>>(Expression.Add(a, b), a, b).Compile(); } T[] payload; public VectorExpr(int size) { payload = new T[size]; } public void Init(T seed1, T seed2) { T curr = seed1; for (int i = 0; i < payload.Length; i++) { payload[i] = curr; curr = add(curr, seed2); } } public void AddTo(VectorExpr<T> r) { T[] ldata = payload, rdata = r.payload; if (ldata.Length != rdata.Length) throw new ArgumentException(); for (int i = 0; i < ldata.Length; i++) ldata[i] = add(ldata[i], rdata[i]); } public T Sum() => Enumerable.Aggregate(payload, add); } class VectorDyn<T> { static T add(T l, T r) => (T)((dynamic)l + r); T[] payload; public VectorDyn(int size) { payload = new T[size]; } public void Init(T seed1, T seed2) { T curr = seed1; for (int i = 0; i < payload.Length; i++) { payload[i] = curr; curr = add(curr, seed2); } } public void AddTo(VectorDyn<T> r) { T[] ldata = payload, rdata = r.payload; if (ldata.Length != rdata.Length) throw new ArgumentException(); for (int i = 0; i < ldata.Length; i++) ldata[i] = add(ldata[i], rdata[i]); } public T Sum() => Enumerable.Aggregate(payload, add); } class VectorNum<T> where T: struct { List<System.Numerics.Vector<T>> payload; int size; public VectorNum(int size) { var vectorSize = System.Numerics.Vector<T>.Count; var vectorCount = (size + vectorSize - 1) / vectorSize; payload = new List<System.Numerics.Vector<T>>(vectorCount); this.size = size; } public void Init(T seed1, T seed2) { var values = new T[System.Numerics.Vector<T>.Count]; T curr = seed1; for (int i = 0; i < size; i++) { int j = i % values.Length; values[j] = curr; curr = (T)((dynamic)curr + seed2); if (j == values.Length - 1 || i == size - 1) { payload.Add(new System.Numerics.Vector<T>(values)); Array.Clear(values, 0, values.Length); } } } public void AddTo(VectorNum<T> r) { List<System.Numerics.Vector<T>> ldata = payload, rdata = r.payload; if (size != r.size) throw new ArgumentException(); for (int i = 0; i < ldata.Count; i++) ldata[i] = ldata[i] + rdata[i]; } public T Sum() { var vectorSum = Enumerable.Aggregate(payload, (v1, v2) => v1 + v2); return Enumerable.Range(0, System.Numerics.Vector<T>.Count) .Select(idx => vectorSum[idx]) .Aggregate((l, r) => (T)((dynamic)l + r)); } } class VectorNatDouble { double[] payload; public VectorNatDouble(int size) { payload = new double[size]; } public void Init(double seed1, double seed2) { double curr = seed1; for (int i = 0; i < payload.Length; i++) { payload[i] = curr; curr = curr + seed2; } } public void AddTo(VectorNatDouble r) { double[] ldata = payload, rdata = r.payload; if (ldata.Length != rdata.Length) throw new ArgumentException(); for (int i = 0; i < ldata.Length; i++) ldata[i] = ldata[i] + rdata[i]; } public double Sum() => payload.Sum(); }
compares the dynamic approach, the Expression approach, the vectorization approach, and the native non-generic code.
The results (Release mode, outside of Visual Studio, x64 on 64-bit Windows) in five starts:
...\bin\x64\Release>Test.exe Expression: elapsed time 00:00:03.2824695, result = 4613743627468,84 Dynamic : elapsed time 00:00:22.1788664, result = 4613743627468,84 Native : elapsed time 00:00:01.3229095, result = 4613743627468,84 Vector : elapsed time 00:00:01.0145604, result = 4613743627468,84 ...\bin\x64\Release>Test.exe Expression: elapsed time 00:00:03.3579191, result = 4613743627468,84 Dynamic : elapsed time 00:00:22.6582909, result = 4613743627468,84 Native : elapsed time 00:00:01.3391487, result = 4613743627468,84 Vector : elapsed time 00:00:01.0315590, result = 4613743627468,84 ...\bin\x64\Release>Test.exe Expression: elapsed time 00:00:03.3705103, result = 4613743627468,84 Dynamic : elapsed time 00:00:21.9322240, result = 4613743627468,84 Native : elapsed time 00:00:01.3294231, result = 4613743627468,84 Vector : elapsed time 00:00:01.0168434, result = 4613743627468,84 ...\bin\x64\Release>Test.exe Expression: elapsed time 00:00:03.3426822, result = 4613743627468,84 Dynamic : elapsed time 00:00:22.5870899, result = 4613743627468,84 Native : elapsed time 00:00:01.3227761, result = 4613743627468,84 Vector : elapsed time 00:00:01.0202565, result = 4613743627468,84 ...\bin\x64\Release>Test.exe Expression: elapsed time 00:00:03.3142676, result = 4613743627468,84 Dynamic : elapsed time 00:00:21.8089834, result = 4613743627468,84 Native : elapsed time 00:00:01.2854937, result = 4613743627468,84 Vector : elapsed time 00:00:00.9963579, result = 4613743627468,84
The average for the Expression is 3.3336 seconds, for the dynamic 22.2331 seconds, the native code is 1.3200 seconds, and the vectorized code is 1.0159 seconds. So, the vectorized version is even 30% faster than the native one.
The results for int
even better:
...\bin\x64\Release>Test.exe Expression: elapsed time 00:00:02.9075671, result = -1870659584 Dynamic : elapsed time 00:00:22.9417084, result = -1870659584 Native : elapsed time 00:00:01.5297815, result = -1870659584 Vector : elapsed time 00:00:00.5281395, result = -1870659584
Vectorization accelerates three times compared to the native code.