// vector.cc
//
// Small demonstration of SSE single-precision vector operations using both
// GCC's vector extensions (operators on `v4sf`) and the <xmmintrin.h>
// intrinsics (`_mm_*_ps`).
//
// compile with: g++ vector.cc -o vector -O2 -msse

#include <iostream>
#include <xmmintrin.h>  // e.g. /usr/lib/gcc/x86_64-linux-gnu/4.8/include/xmmintrin.h

// Prints `var` together with its own source-level name.
#define PRINT(var) print((var), #var)

// Four packed floats (16 bytes), declared through GCC's vector extension.
typedef float v4sf __attribute__ ((vector_size (16)));

void print(const v4sf& vec, const char* name);
void print(const float* vec, const char* name);

int main()
{
    // One way to initialize a vector: a brace-enclosed list of lanes.
    const v4sf vecA = {0, 1, 2, 3};

    // Another initialization technique: loading the data from an array.
    // _mm_load_ps requires a 16-byte aligned address; a plain float array is
    // only guaranteed float alignment, so the alignment is requested explicitly.
    const float arrayA[] __attribute__ ((aligned (16))) = {4, 2, 3, 6};
    const v4sf vecAA = _mm_load_ps(arrayA);

    // One way of doing vector addition. The other method explicitly uses
    // _mm_add_ps().
    const v4sf vecB = vecA + vecA;

    // Store vector data in an array. _mm_store_ps has the same 16-byte
    // alignment requirement as _mm_load_ps.
    float arrayB[4] __attribute__ ((aligned (16)));
    _mm_store_ps(arrayB, vecB);

    // Multiplication by a scalar: broadcast the scalar to all lanes first.
    const v4sf vecC = _mm_set1_ps(2.0f) * vecB;
    const v4sf vecD = vecC - _mm_set1_ps(0.5f) * vecA;
    const v4sf vecE = _mm_set1_ps(1.0f)
                    + _mm_set1_ps(5.0f) * vecC / (vecA + _mm_set1_ps(3.0f));

    // For logic operations, TRUE = 0xFFFFFFFF, FALSE = 0x00000000.
    // Thus, the result is a mask which can be bitwise ANDed using _mm_and_ps().
    const v4sf mask = _mm_cmpeq_ps(vecA, vecB);

    // vecF[i] = (vecA[i] == vecB[i]) ? vecE[i] : 0
    const v4sf vecF = _mm_and_ps(mask, vecE);

    // vecG[i] = (vecA[i] == vecB[i]) ? vecE[i] : vecD[i]
    const v4sf vecG = _mm_or_ps(_mm_and_ps(mask, vecE),
                                _mm_andnot_ps(mask, vecD));

    // vecH[i] = sqrt(vecE[i])
    const v4sf vecH = _mm_sqrt_ps(vecE);

    // vecI[i] approx 1.0 / sqrt(vecE[i])
    const v4sf vecI = _mm_rsqrt_ps(vecE);

    // vecJ[i] approx 1.0 / vecAA[i]
    const v4sf vecJ = _mm_rcp_ps(vecAA);

    // vecK[i] = 1.0 / vecAA[i], using a real division rather than the
    // approximation above.
    const v4sf vecK = _mm_set1_ps(1.0f) / vecAA;

    // Swap the low and high halves: vecL = {vecA[2], vecA[3], vecA[0], vecA[1]}
    const v4sf vecL = _mm_shuffle_ps(vecA, vecA, _MM_SHUFFLE(1, 0, 3, 2));

    ////////////////////////////// output ///////////////////////////////
    PRINT(vecA);
    PRINT(vecAA);
    PRINT(vecB);
    PRINT(arrayB);
    PRINT(vecC);
    PRINT(vecD);
    PRINT(vecE);
    PRINT(vecF);
    PRINT(vecG);
    PRINT(vecH);
    PRINT(vecI);
    PRINT(vecJ);
    PRINT(vecK);
    PRINT(vecL);

    return 0;
}

// Prints the four lanes of `vec` under the heading `name`.
void print(const v4sf& vec, const char* name)
{
    // Copy the lanes out with an unaligned store instead of reading through a
    // union, which would be type punning (undefined behaviour in ISO C++).
    float lanes[4];
    _mm_storeu_ps(lanes, vec);
    print(lanes, name);
}

// Prints the first four floats at `vec` under the heading `name`:
// a blank line, the name, then one tab-indented value per line.
// `vec` must point to at least four readable floats.
void print(const float* vec, const char* name)
{
    std::cout << "\n" << name << '\n';
    for (int i = 0; i < 4; ++i)
    {
        std::cout << "\t" << vec[i] << '\n';
    }
}