SIMD

SIMD:single instruction multiply data
單指令多數(shù)據(jù),多用于矢量運算中,可以加速指令運算,比如矩陣乘

SIMD的思想在不同平臺架構(gòu)下進行了實現(xiàn)

  • X86:SSE指令
  • ARM:NEON指令
  • MIPS:MSA

不同平臺架構(gòu)下為實現(xiàn)SIMD思想也添加了不同的硬件支持(專用寄存器):

  • X86:目前支持128bit寄存器,256bit,甚至是512bit
  • ARM:128bit
  • MIPS:128bit

為何SIMD可以更加快速的進行矩陣運算
一般流程:每次從內(nèi)存加載一個數(shù)據(jù)寄存器進行運算
SIMD:一次加載多個數(shù)據(jù)到專用寄存器,則一條指令就能進行多個數(shù)據(jù)的加乘運算

SSE指令下:矩陣運算主要流程如下:
SIMD.png

簡單來說就是:加載 -> 運算 -> 存儲

  1. 加載:將數(shù)據(jù)從內(nèi)存加載到專用寄存器,一次可以加載多個數(shù)據(jù)
  2. 運算:使用SSE乘/加命令講點積和累加
  3. 存儲:將結(jié)果保存到內(nèi)存中
    具體命令可參考Intel SSE指令網(wǎng)址

MSA與NEON指令類似。
在矩陣運算中,SIMD優(yōu)化視具體情況而定,除了SIMD,還可進行循環(huán)展開,緩存控制(將數(shù)據(jù)拆分至緩存可以存儲大小,方便數(shù)據(jù)讀取),通用寄存器使用(頻繁使用的數(shù)據(jù)直接放至寄存器,C++使用register關(guān)鍵字)

具體代碼參考:
Matrix.h

#include <iostream>
#include <cstdlib>
#include <stdio.h>
#include <assert.h>
using namespace std;



template <typename T>   //T is char, short or int;
class Matrix{

public:
    T **_Matrix;
    size_t _Row, _Column;
    Matrix() :_Matrix(nullptr), _Row(0), _Column(0) {}
    Matrix(size_t r, size_t c) :_Row(r), _Column(c) {
        if (!_Column || !_Row) return;
        _Matrix = (T**)_mm_malloc(_Row * sizeof(T*), 16);
        T **p = _Matrix, **end = _Matrix + _Row;
        do {
            *(p++) = (T*)_mm_malloc(_Column * sizeof(T), 16);
        } while (p != end);
    }
    Matrix(size_t r, size_t c, const T init) :_Row(r), _Column(c) {//use init to initialize the matrix
        if (!_Column || !_Row) return;
        _Matrix = (T**)_mm_malloc(_Row * sizeof(T*), 16);
        T **pr = _Matrix, **endr = _Matrix + _Row, *p, *end;
        do {
            p = *(pr++) = (T*)_mm_malloc(_Column * sizeof(T), 16);
            end = p + _Column;
            do {
                *(p++) = init;
            } while (p != end);
        } while (pr != endr);
    }
    Matrix(const Matrix& B) {//拷貝構(gòu)造
        _Row = B._Row;
        _Column = B._Column;
        _Matrix = (T**)_mm_malloc(_Row * sizeof(T*), 16);
        T **pbr = B._Matrix, **endbr = B._Matrix + _Row, *pb, *endb,
            **par = _Matrix, **endar = _Matrix + _Row, *pa, *enda;
        do {
            pa = *(par++) = (T*)_mm_malloc(_Column * sizeof(T), 16);
            enda = pa + _Column;
            pb = *(pbr++);
            endb = pb + _Column;
            do {
                *(pa++) = *(pb++);
            } while (pa != enda);
        } while (par != endar);
    }
    ~Matrix() {//析構(gòu)
        if (!_Matrix || !_Column || !_Row) return;
        T **p = _Matrix, **end = _Matrix + _Row;
        do {
            _mm_free(*(p++));
        } while (p != end);
        _Column = _Row = 0;
        _mm_free(_Matrix);
    }
    T& operator()(size_t i, size_t j) { return _Matrix[i][j]; }//get the  elements of row i and column j

    const T operator()(size_t i, size_t j)const { return _Matrix[i][j]; }//get the  elements of row i and column j

    Matrix& operator=(Matrix& B) {//
        if (_Matrix && _Row && _Column) {
            T **p = _Matrix, **end = _Matrix + _Row;
            do {
                _mm_free(*(p++));
            } while (p != end);
            _mm_free(_Matrix);
        }
        _Row = B._Row;
        _Column = B._Column;
        _Matrix = B._Matrix;
        B._Matrix = nullptr;
        return *this;
    }

    //transpose the matrix
    Matrix Tranpose() const{
        Matrix temp(_Column, _Row);
        for (int i = 0; i < _Row; i++) {
            for (int j = 0; j < _Column; j++) {
                temp(j, i) = (*this)(i, j);
            }
        }
        return temp;
    }

    //alignas
    Matrix _alignas(int align_row, int align_col) {
        int rest_row = align_row - _Row % align_row, rest_col = align_col - _Column % align_col;
        if (rest_row || rest_col) {
            int row = _Row + rest_row, col = _Column + rest_col;
            Matrix temp(row, col, 0);
            T **pbr = _Matrix, **endbr = _Matrix + _Row, *pb, *endb,
                **par = temp._Matrix, *pa;
            do {
                pa = *(par++);
                pb = *(pbr++);
                endb = pb + _Column;
                do {
                    *(pa++) = *(pb++);
                } while (pb != endb);
            } while (pbr != endbr);
            return temp;
        }
        return *this;
    }
    void print() {
        cout << "the matrix is:\n" << endl;
        int column_min = 10 < _Column ? 10 : _Column;
        int row_min = 10 <_Row ? 10 : _Row;
        for (short i = 0; i < row_min; i++) {
            for (short j = 0; j < column_min; j++) {
                cout << (short)_Matrix[i][j] << "  ";
            }
            cout << endl;
        }
    }
};

Vector.h

#pragma once
#include <iostream>
#include <cstdlib>
#include <stdio.h>
#include <assert.h>
using namespace std;



template <typename T>   //T is char, short or int;
class Vector {

public:
    T *_Vector;
    size_t _Column;
    Vector() :_Vector(NULL),  _Column(0) {}
    Vector(size_t c) : _Column(c) {
        if (!_Column) return;
        _Vector = (T*)_mm_malloc(_Column * sizeof(T), 16);

    }
    Vector(size_t c, const T init) : _Column(c) {//use init to initialize the Vector
        if (!_Column) return;
        _Vector = (T*)_mm_malloc(_Column * sizeof(T), 16);
        T *pr = _Vector, *endr = _Vector + _Column, *p, *end;
        do {
            *(pr++) = init; 
        } while (pr != endr);
    }
    Vector(const Vector& B) {//拷貝構(gòu)造
        _Column = B._Column;
        _Vector = (T*)_mm_malloc(_Column * sizeof(T*), 16);
        T *pbr = B._Vector, *endbr = B._Vector + _Column, *par = _Vector;
        do {
            *(par++) = *(pbr++);
        } while (pbr != endbr);
    }
    ~Vector() {//析構(gòu)
        if (!_Vector) return;
        T *p = _Vector;
        _mm_free(_Vector);
    }
    T& operator()(size_t i) { return _Vector[i]; }//get the  elements of row i and column j

    const T operator()(size_t i)const { return _Vector[i]; }//get the  elements of row i and column j

    
    Vector& operator=(Vector& B) {//
        if (_Column && _Vector != NULL) {
            _mm_free(_Vector);
        }
        _Column = B._Column;
        _Vector = B._Vector;
        B._Vector = nullptr;
        return *this;
    }
    

    //alignas(16)
    Vector _alignas(int align_size) {
        int rest = align_size - _Column % align_size;
        if (rest) {
            int col = _Column + rest;
            Vector temp(col, 0);
            T *p = _Vector, *end = _Vector + _Column, *t = temp._Vector;
            do {
                *(t++) = *(p++);
            } while (p != end);
            return temp;
        }
        return *this;
    }
    //print
    void print() {
        cout << "the Vector Col is:  " <<_Column << "\nThe Vector is:\n" << endl;
        int column_min = 10 < _Column ? 10 : _Column;
        for (int i = 0; i < column_min; i++) {
            cout << (short)_Vector[i] << "  ";
        }
        cout << endl;
    }

};


Operator.h

#pragma once
#include <iostream>
#include <cstdlib>
#include <assert.h>
#include "Matrix.h"
#include "Vector.h"
#include <emmintrin.h>                 
#include <immintrin.h> 
using namespace std;

///Vec*Matrix
template <class T>
Vector<T> Vec_Mul_Mat_C(const Vector<T> &V, const Matrix<T> &A) {
    if (V._Column != A._Row) {
        cout << "The col of vector and row of matrix is not same, please check!!!" << endl;
        return NULL;
    }
    Vector<T> ret(A._Column, 0);
    int sum = 0;
    for (int i = 0; i < A._Column; ++i) {
        for (int j = 0; j < V._Column; ++j) {
            ret(i) += V(j) * A(j, i);
        }
    }
    return ret;
}


template <class T>
Vector<T> Vec_Mul_Mat_SSE(const Vector<T> &V, const Matrix<T> &A) {
    if (V._Column != A._Row) {
        cout << "The col of vector and row of matrix is not same, please check!!!" << endl;
        return NULL;
    }

    Vector<T> ret(A._Column, 0);
    Matrix<T> C = A.Tranpose();
    
    if (sizeof(T) == 1) {
        __m128i X, Y;
        __m128i acc;
        alignas(16) short temp[8];
        int i(0), j(0);
        for (i = 0; i < C._Row; ++i) {
            acc = _mm_setzero_si128();
            for (j = 0; j + 16 <= V._Column; j += 16) {
                X = _mm_load_si128((__m128i*)V._Vector);
                Y = _mm_load_si128(((__m128i*)&(C._Matrix[i][j])));

                acc = _mm_add_epi16(acc, _mm_maddubs_epi16(X, Y));
            }
            _mm_store_si128((__m128i*)&temp[0], acc);
            ret(i) = temp[0] + temp[1] + temp[2] + temp[3] + temp[4] + temp[5] + temp[6] + temp[7];
        }
    }
    else if (sizeof(T) == 2) {
        Vector<T> ret(A._Column, 0);
        __m128i X, Y;
        __m128i acc;
        alignas(16) int temp[4];
        int i(0), j(0);
        for (i = 0; i < C._Row; ++i) {
            acc = _mm_setzero_si128();
            for (j = 0; j + 8 <= V._Column; j += 8) {
                X = _mm_load_si128((__m128i*)V._Vector);
                Y = _mm_load_si128(((__m128i*)&(C._Matrix[i][j])));

                acc = _mm_add_epi32(acc, _mm_madd_epi16(X, Y));
            }
            _mm_store_si128((__m128i*)&temp[0], acc);
            ret(i) = temp[0] + temp[1] + temp[2] + temp[3];
        }
    }
    return ret;
}



//matrix *matrix C
template <class T>
Matrix<T> Matrix_Mul_C(const Matrix<T> &A, const Matrix<T> &B) {

    Matrix<T> retC(A._Row, B._Column, 0);
    if (A._Column != B._Row) {
        cout << "The col of vector and row of matrix is not same, please check!!!" << endl;
        return retC;
    }
    
    for (short i = 0; i < A._Row; i++) {
        for (short j = 0; j < B._Column; j++) {
            for (short k = 0; k < A._Column; k++) {
                retC(i, j) += A(i, k)*B(k, j);
            }
        }
    }
    return retC;
}


//unroll-loop for 1*8
template <class T>
Matrix<T> Matrix_Mul_C_(const Matrix<T> &A, const Matrix<T> &B) {
    
    Matrix<T> retC(A._Row, B._Column, 0);
    if (A._Column != B._Row) {
        cout << "The col of vector and row of matrix is not same, please check!!!" << endl;
        return retC;
    }

    
    int i, j, k;
    for (i = 0; i < A._Row; ++i) {
        for (j = 0; j < B._Column; j++) {
            for (k = 0; k + 8 <= A._Column; k += 8) {
                retC(i, j) += A(i, k)*B(k, j) + A(i, k + 1)*B(k + 1, j) + A(i, k + 2)*B(k + 2, j) + A(i, k + 3)*B(k + 3, j)
                    + A(i, k + 4)*B(k + 4, j) + A(i, k + 5)*B(k + 5, j) + A(i, k + 6)*B(k + 6, j) + A(i, k + 7)*B(k + 7, j);
            }
        }
    }
    return retC;
}


//unroll-loop for 8*8
template <class T>
Matrix<T> Matrix_Mul_CC_(const Matrix<T> &A, const Matrix<T> &B) {
    Matrix<T> retC(A._Row, B._Column, 0);
    int i, j, k;
    for (i = 0; i + 8 <= A._Row; i += 8) {
        for (j = 0; j < B._Column; j++) {
            for (k = 0; k + 8 <= A._Column; k += 8) {
                for (int t = 0; t < 8; t++) {
                    retC(i + t, j) += A(i + t, k)*B(k, j) + A(i + t, k + 1)*B(k + 1, j) + A(i + t, k + 2)*B(k + 2, j) + A(i + t, k + 3)*B(k + 3, j)
                        + A(i + t, k + 4)*B(k + 4, j) + A(i + t, k + 5)*B(k + 5, j) + A(i + t, k + 6)*B(k + 6, j) + A(i + t, k + 7)*B(k + 7, j);
                }
            }
        }
    }
    return retC;
}



template<class T>
Matrix<T> Matrix_Mul_SSE(const Matrix<T> &A, const Matrix<T> &B) {
    Matrix<T> retSSE(A._Row, B._Column, 0);
    if (A._Column != B._Row) {
        cout << "The col of vector and row of matrix is not same, please check!!!" << endl;
        return retSSE;
    }
    Matrix<T> C = B.Tranpose();
    __m128i X, Y;
    __m128i acc = _mm_setzero_si128();

    int i, j, k;
    if(sizeof(T) == 1) {
        alignas(16) short temp[8];
        for (i = 0; i < A._Row; ++i) {
            for (j = 0; j < C._Row; ++j) {
                acc = _mm_setzero_si128();
                for (k = 0; k + 16 <= A._Column; k += 16) {
                    X = _mm_load_si128((__m128i*)&(A._Matrix[i][k]));
                    Y = _mm_load_si128((__m128i*)&(C._Matrix[j][k]));
                    acc = _mm_add_epi16(acc, _mm_maddubs_epi16(X, Y));
                }
                _mm_store_si128((__m128i*)&temp[0], acc);
                retSSE(i, j) = temp[0] + temp[1] + temp[2] + temp[3] + temp[4] + temp[5] + temp[6] + temp[7];
            }
        }
    }
    else if(sizeof(T) == 2) {
        alignas(16) int temp[4];
        for (i = 0; i < A._Row; ++i) {
            for (j = 0; j < C._Row; ++j) {
                acc = _mm_setzero_si128();
                for (k = 0; k + 8 <= A._Column; k += 8) {
                    X = _mm_load_si128((__m128i*)&(A._Matrix[i][k]));
                    Y = _mm_load_si128((__m128i*)&(C._Matrix[j][k]));
                    acc = _mm_add_epi32(acc, _mm_madd_epi16(X, Y));
                }
                _mm_store_si128((__m128i*)&temp[0], acc);
                retSSE(i, j) = temp[0] + temp[1] + temp[2] + temp[3];
                //retC(i, j) = inner_prod;
            }
        }
    }   
    return retSSE;
}


///compare the result of C and SSE
template <class T>
bool C_Compare_SSE(Matrix<T> &C_Matrix, Matrix<T> SSE_Matrix) {
    int i(C_Matrix._Row);
    while (--i >= 0) {
        if (memcmp(C_Matrix._Matrix[i], SSE_Matrix._Matrix[i], C_Matrix._Column * sizeof(T)) != 0) {
            return false;
        }
    }
    return true;
}

main.c

#include <stdio.h>
#include "Operate.h"
#include <cstdlib>
#include <time.h>

//Vec * Matrix
template <class T>
void Vec_Mat() {
    int N = 251;
    Matrix<T> A(10 * N, N*3, 2);
    double start, dt;
    A(3, 1) = 3;
    Vector<T> C(10 * N, 2);
    Vector<T> C1 = C._alignas(16);
    Matrix<T> C_Trans = A._alignas(16, 16); 

    cout << "\nThe time of C Vec * Matrix: " << endl;
    start = clock();
    int cir = 100;
    Vector<T> ret, retSSE;
    while (cir-- > 0) {
        Vec_Mul_Mat_C(C1, C_Trans);
    }
    dt = static_cast<double>(clock() - start) / CLOCKS_PER_SEC;
    cout << dt / 100 << "s" << endl;
    ret = Vec_Mul_Mat_C(C1, C_Trans);
    //ret.print();


    cout << "\nThe time of SSE Vec * Matrix :" << endl;
    start = clock();
    cir = 100;
    while (cir-- > 0) {
        Vec_Mul_Mat_SSE(C1, C_Trans);
    }

    dt = static_cast<double>(clock() - start) / CLOCKS_PER_SEC;
    cout << dt / 100 << "s" << endl;
    retSSE = Vec_Mul_Mat_SSE(C1, C_Trans);
    //retSSE.print();

    if (memcmp(ret._Vector, retSSE._Vector, ret._Column * sizeof(char)) == 0) {
        cout << "C and SSE have the same result " << endl;
    }
}

template <class T>
void Mat_Mat() {
    int N = 254;
    Matrix<T> A(N, N, 2), B(N, N, 1);
    A(3, 1) = 3, B(1, 0) = 4; B(2, 0) = 3;
    Matrix<T> A_align = A._alignas(16, 16);
    Matrix<T> B_align = B._alignas(16, 16);
    double start, dt;

    cout << "\nThe time of C Matrix * Matrix: " << endl;
    start = clock();
    int cir = 100;
    Matrix<T> ret, retSSE;
    while (cir-- > 0) {
        Matrix_Mul_C(A_align, B_align);
    }
    dt = static_cast<double>(clock() - start) / CLOCKS_PER_SEC;
    cout << dt / 100 << "s" << endl;
    ret = Matrix_Mul_C(A_align, B_align);
    //ret.print();
    

    /*
    start = clock();
    cir = 100;
    while (cir-- > 0) {
        Matrix_Mul_C_(A_align, B_align);
    }

    dt = static_cast<double>(clock() - start) / CLOCKS_PER_SEC;
    cout << dt / 100 << "s" << endl;

    start = clock();
    cir = 100;
    while (cir-- > 0) {
        Matrix_Mul_CC_(A_align, B_align);
    }

    dt = static_cast<double>(clock() - start) / CLOCKS_PER_SEC;
    cout << dt / 100 << "s" << endl;

*/

    cout << "\nThe time of SSE Matrix * Matrix :" << endl;
    start = clock();
    cir = 100;
    while (cir-- > 0) {
        Matrix_Mul_SSE(A_align, B_align);
    }

    dt = static_cast<double>(clock() - start) / CLOCKS_PER_SEC;
    cout << dt / 100 << "s" << endl;
    retSSE = Matrix_Mul_SSE(A_align, B_align);
    retSSE.print();


    start = clock();
    cir = 100;
    while (cir-- > 0) {
        Matrix_Mul_SSE_8(A_align, B_align);
    }

    dt = static_cast<double>(clock() - start) / CLOCKS_PER_SEC;
    cout << dt / 100 << "s" << endl;
    retSSE = Matrix_Mul_SSE_8(A_align, B_align);
    retSSE.print();

    /*
    cout << "\nThe time of SSE Vec * Matrix :" << endl;
    start = clock();
    cir = 10;
    while (cir-- > 0) {
        retSSE = Matrix_Mul_SSE_8(A_align, B_align);
    }

    dt = static_cast<double>(clock() - start) / CLOCKS_PER_SEC;
    cout << dt / 10 << "s" << endl;
    retSSE.print();
    */
    if (C_Compare_SSE(ret, retSSE)) {
        cout << "C and SSE have the same result " << endl;
    }
    
}


void main(short argc, char* argv[]) {

    //Vec_Mat<short>();

    Mat_Mat<short>();
    cout << "hello" << endl;
    return;
}

其中,Operator.h主要是進行矩陣運算,為了方便進行char和short類型的數(shù)據(jù)運算,使用了模板,初次使用,莫怪
本文思想[參考網(wǎng)址]。(https://blog.csdn.net/artorias123/article/details/86527456)

最后編輯于
?著作權(quán)歸作者所有,轉(zhuǎn)載或內(nèi)容合作請聯(lián)系作者
【社區(qū)內(nèi)容提示】社區(qū)部分內(nèi)容疑似由AI輔助生成,瀏覽時請結(jié)合常識與多方信息審慎甄別。
平臺聲明:文章內(nèi)容(如有圖片或視頻亦包括在內(nèi))由作者上傳并發(fā)布,文章內(nèi)容僅代表作者本人觀點,簡書系信息發(fā)布平臺,僅提供信息存儲服務(wù)。

友情鏈接更多精彩內(nèi)容