SSE4.1和SSE4.2 Intrinsics各函数介绍

SIMD相关头文件包括:

//#include <ivec.h>//MMX
//#include <fvec.h>//SSE(also include ivec.h)
//#include <dvec.h>//SSE2(also include fvec.h)

#include <mmintrin.h> //MMX
#include <xmmintrin.h> //SSE(include mmintrin.h)
#include <emmintrin.h> //SSE2(include xmmintrin.h)
#include <pmmintrin.h> //SSE3(include emmintrin.h)
#include <tmmintrin.h>//SSSE3(include pmmintrin.h)
#include <smmintrin.h>//SSE4.1(include tmmintrin.h)
#include <nmmintrin.h>//SSE4.2(include smmintrin.h)
#include <wmmintrin.h>//AES(include nmmintrin.h)
#include <immintrin.h>//AVX(include wmmintrin.h)
#include <intrin.h>//(include immintrin.h)

mmintrin.h为MMX 头文件,其中__m64的定义为:

/*
 * __m64: the MMX data type, quoted here from mmintrin.h.
 * It is a union, so every member below overlays the same storage and simply
 * offers a different typed view of the same bits (one 64-bit integer, two
 * floats, or packed 8/16/32-bit signed/unsigned integers).
 * NOTE(review): __declspec(intrin_type), _CRT_ALIGN(8) and the __intN types
 * are compiler-specific extensions; presumably they mark an intrinsic type
 * with 8-byte alignment -- confirm against the compiler documentation.
 */
typedef union __declspec(intrin_type) _CRT_ALIGN(8) __m64
{
    unsigned __int64    m64_u64;        /* whole value as one unsigned 64-bit integer */
    float               m64_f32[2];     /* view as 2 floats */
    __int8              m64_i8[8];      /* view as 8 signed 8-bit integers */
    __int16             m64_i16[4];     /* view as 4 signed 16-bit integers */
    __int32             m64_i32[2];     /* view as 2 signed 32-bit integers */
    __int64             m64_i64;        /* whole value as one signed 64-bit integer */
    unsigned __int8     m64_u8[8];      /* view as 8 unsigned 8-bit integers */
    unsigned __int16    m64_u16[4];     /* view as 4 unsigned 16-bit integers */
    unsigned __int32    m64_u32[2];     /* view as 2 unsigned 32-bit integers */
} __m64;

xmmintrin.h为SSE 头文件,此头文件里包含MMX头文件,其中__m128的定义为:

/*
 * __m128: the SSE packed single-precision data type, quoted here from
 * xmmintrin.h.
 * It is a union, so every member overlays the same storage; the first member
 * (four floats) is the primary view, the integer arrays are alternative views
 * of the same bits.
 * NOTE(review): __declspec(intrin_type) and _CRT_ALIGN(16) are
 * compiler-specific extensions; presumably they request 16-byte alignment --
 * confirm against the compiler documentation.
 */
typedef union __declspec(intrin_type) _CRT_ALIGN(16) __m128 {
     float               m128_f32[4];   /* view as 4 floats */
     unsigned __int64    m128_u64[2];   /* view as 2 unsigned 64-bit integers */
     __int8              m128_i8[16];   /* view as 16 signed 8-bit integers */
     __int16             m128_i16[8];   /* view as 8 signed 16-bit integers */
     __int32             m128_i32[4];   /* view as 4 signed 32-bit integers */
     __int64             m128_i64[2];   /* view as 2 signed 64-bit integers */
     unsigned __int8     m128_u8[16];   /* view as 16 unsigned 8-bit integers */
     unsigned __int16    m128_u16[8];   /* view as 8 unsigned 16-bit integers */
     unsigned __int32    m128_u32[4];   /* view as 4 unsigned 32-bit integers */
 } __m128;

emmintrin.h为SSE2头文件,此头文件里包含SSE头文件,其中__m128i和__m128d的定义为:

/*
 * __m128i: the SSE2 packed-integer data type, quoted here from emmintrin.h.
 * It is a union, so every member overlays the same storage; each array is a
 * different element-width / signedness view of the same bits.
 * NOTE(review): __declspec(intrin_type) and _CRT_ALIGN(16) are
 * compiler-specific extensions; presumably they request 16-byte alignment --
 * confirm against the compiler documentation.
 */
typedef union __declspec(intrin_type) _CRT_ALIGN(16) __m128i {
    __int8              m128i_i8[16];   /* view as 16 signed 8-bit integers */
    __int16             m128i_i16[8];   /* view as 8 signed 16-bit integers */
    __int32             m128i_i32[4];   /* view as 4 signed 32-bit integers */
    __int64             m128i_i64[2];   /* view as 2 signed 64-bit integers */
    unsigned __int8     m128i_u8[16];   /* view as 16 unsigned 8-bit integers */
    unsigned __int16    m128i_u16[8];   /* view as 8 unsigned 16-bit integers */
    unsigned __int32    m128i_u32[4];   /* view as 4 unsigned 32-bit integers */
    unsigned __int64    m128i_u64[2];   /* view as 2 unsigned 64-bit integers */
} __m128i;

/*
 * __m128d: the SSE2 packed double-precision data type, quoted here from
 * emmintrin.h. Unlike the other vector types shown in this article it is a
 * struct with a single member, so there are no alternative integer views.
 * NOTE(review): __declspec(intrin_type) and _CRT_ALIGN(16) are
 * compiler-specific extensions; presumably they request 16-byte alignment --
 * confirm against the compiler documentation.
 */
typedef struct __declspec(intrin_type) _CRT_ALIGN(16) __m128d {
    double              m128d_f64[2];   /* the 2 double-precision elements */
} __m128d;

smmintrin.h为SSE4.1头文件,其文件中各函数的介绍:

	/*Integer blend instructions - select data from 2 sources
	using constant/variable mask*/
	//v1=(v10, v11, ..., v17), v2=(v20, v21, ..., v27)
	//mask:If the corresponding flag bit is 0, the value is selected from parameter v1.
	//Otherwise the value is from parameter v2.
	//则r0=(mask0 == 0) ? v10 : v20,...,r7= (mask7 == 0) ? v17 : v27
	extern __m128i _mm_blend_epi16 (__m128i v1, __m128i v2, const int mask);
	//v1=(v10, v11, ..., v115), v2=(v20, v21, ..., v215), mask=(mask0, mask1, ..., mask15)
	//则r0=(mask0 & 0x80) ? v20 : v10, ..., r15=(mask15 & 0x80) ? v215 : v115
	extern __m128i _mm_blendv_epi8 (__m128i v1, __m128i v2, __m128i mask);

	/*Float single precision blend instructions - select data
	from 2 sources using constant/variable mask */
	//v1=(v10, v11, v12, v13), v2=(v20, v21, v22, v23)
	//则r0=(mask0 == 0) ? v10 : v20,..., r3= (mask3 == 0) ? v13 : v23
	extern __m128  _mm_blend_ps (__m128  v1, __m128  v2, const int mask);
	//v1=(v10, v11, v12, v13), v2=(v20, v21, v22, v23), v3=(v30, v31, v32, v33)
	//则r0= (v30 & 0x80000000) ? v20 : v10,...,r3= (v33 & 0x80000000) ? v23 : v13
	extern __m128  _mm_blendv_ps(__m128  v1, __m128  v2, __m128 v3);

	/*Float double precision blend instructions - select data
	from 2 sources using constant/variable mask*/
	//v1=(v10, v11), v2=(v20, v21)
	//则r0 = (mask0 == 0) ? v10 : v20, r1 = (mask1 == 0) ? v11 : v21
	extern __m128d _mm_blend_pd (__m128d v1, __m128d v2, const int mask);
	//v1=(v10, v11), v2=(v20, v21), v3=(v30, v31)
	//则r0 = (v30 & 0x8000000000000000) ? v20 : v10,
	//r1 = (v31 & 0x8000000000000000) ? v21 : v11
	extern __m128d _mm_blendv_pd(__m128d v1, __m128d v2, __m128d v3);

	/*Dot product instructions with mask-defined summing and zeroing
	of result's parts*/
	//val1=(val10, ..., val13), val2=(val20,...,val23)
	/*则tmp0 := (mask4 == 1) ? (val10 * val20) : +0.0
		tmp1 := (mask5 == 1) ? (val11 * val21) : +0.0
		tmp2 := (mask6 == 1) ? (val12 * val22) : +0.0
		tmp3 := (mask7 == 1) ? (val13 * val23) : +0.0
		tmp4 := tmp0 + tmp1 + tmp2 + tmp3
		r0 := (mask0 == 1) ? tmp4 : +0.0
		r1 := (mask1 == 1) ? tmp4 : +0.0
		r2 := (mask2 == 1) ? tmp4 : +0.0
		r3 := (mask3 == 1) ? tmp4 : +0.0 */
	extern __m128  _mm_dp_ps(__m128  val1, __m128  val2, const int mask);
	//val1=(val10, val11), val2=(val20, val21)
	/*则tmp0 := (mask4 == 1) ? (val10 * val20) : +0.0
		tmp1 := (mask5 == 1) ? (val11 * val21) : +0.0
		tmp2 := tmp0 + tmp1
		r0 := (mask0 == 1) ? tmp2 : +0.0
		r1 := (mask1 == 1) ? tmp2 : +0.0 */
	extern __m128d _mm_dp_pd(__m128d val1, __m128d val2, const int mask);

	/*Packed integer 64-bit comparison, zeroing or filling with ones
	corresponding parts of result */
	//val1=(val10, val11), val2=(val20, val21)
	//则r0 = (val10 == val20) ? 0xffffffffffffffff : 0,
	//r1 = (val11 == val21) ? 0xffffffffffffffff : 0
	extern __m128i _mm_cmpeq_epi64(__m128i val1, __m128i val2);

	/* Min/max packed integer instructions*/
	//val1=(val10,...,val115), val2=(val20,...,val215)
	//则r0 = (val10 < val20) ? val10 : val20, ...,
	//r15 = (val115 < val215) ? val115 : val215
	extern __m128i _mm_min_epi8 (__m128i val1, __m128i val2);
	//val1=(val10,...,val115), val2=(val20,...,val215)
	//则r0 = (val10 > val20) ? val10 : val20, ...,
	//r15 = (val115 > val215) ? val115 : val215
	extern __m128i _mm_max_epi8 (__m128i val1, __m128i val2);
	//val1=(val10,...,val17), val2=(val20,...,val27), eight 16-bit unsigned integers
	//则r0 = (val10 < val20) ? val10 : val20, ...,
	//r7 = (val17 < val27) ? val17 : val27
	extern __m128i _mm_min_epu16(__m128i val1, __m128i val2);
	//val1=(val10,...,val17), val2=(val20,...,val27),eight 16-bit unsigned integers
	//则r0 = (val10 > val20) ? val10 : val20, ...,
	//r7 = (val17 > val27) ? val17 : val27
	extern __m128i _mm_max_epu16(__m128i val1, __m128i val2);
	//val1=(val10,...,val13), val2=(val20,...,val23)
	//则r0 = (val10 < val20) ? val10 : val20, ...,
	//r3 = (val13 < val23) ? val13 : val23
	extern __m128i _mm_min_epi32(__m128i val1, __m128i val2);
	//val1=(val10,...,val13), val2=(val20,...,val23)
	//则r0 = (val10 > val20) ? val10 : val20, ...,
	//r3 = (val13 > val23) ? val13 : val23
	extern __m128i _mm_max_epi32(__m128i val1, __m128i val2);
	//val1=(val10,...,val13), val2=(val20,...,val23), four 32-bit unsigned integers
	//则r0 = (val10 < val20) ? val10 : val20, ...,
	//r3 = (val13 < val23) ? val13 : val23
	extern __m128i _mm_min_epu32(__m128i val1, __m128i val2);
	//val1=(val10,...,val13), val2=(val20,...,val23), four 32-bit unsigned integers
	//则r0 = (val10 > val20) ? val10 : val20, ...,
	//r3 = (val13 > val23) ? val13 : val23
	extern __m128i _mm_max_epu32(__m128i val1, __m128i val2);

	/*Packed integer 32-bit multiplication with truncation
	of upper halves of results*/
	//a=(a0,...,a3), b=(b0,...,b3), 则r0=a0 * b0, ..., r3=a3 * b3
	//Only the lower 32-bits of each product are saved
	extern __m128i _mm_mullo_epi32(__m128i a, __m128i b);

	/*Packed integer 32-bit multiplication of 2 pairs of operands
	producing two 64-bit results */
	//a=(a0,a1,a2,a3), b=(b0,b1,b2,b3)
	//r0=low_half(a0*b0), r1=high_half(a0*b0),r2=low_half(a2*b2), r3=high_half(a2*b2)
	//The upper 32-bits of each quadword of the input parameters are not used
	extern __m128i _mm_mul_epi32(__m128i a, __m128i b);

	/*Packed integer 128-bit bitwise comparison.
	return 1 if (val 'and' mask) == 0*/
	//则r = (mask & val) == 0, Generates a return value of 0 or 1
	extern int _mm_testz_si128(__m128i mask, __m128i val);

	/*Packed integer 128-bit bitwise comparison.
	return 1 if (val 'and_not' mask) == 0 */
	//则r=1 if all the bits set in val are set in mask; otherwise 0
	//Generates a return value of 0 or 1
	extern int _mm_testc_si128(__m128i mask, __m128i val);

	/*Packed integer 128-bit bitwise comparison
	ZF = ((val 'and' mask) == 0)  CF = ((val 'and_not' mask) == 0)
	return 1 if both ZF and CF are 0 */
	//则 ZF := (mask & s2) == 0,CF := (~mask & s2) == 0, r = ~ZF & ~CF
	//Generates a return value of 0 or 1
	extern int _mm_testnzc_si128(__m128i mask, __m128i s2);

	/*Insert single precision float into packed single precision
	array element selected by index.
	The bits [7-6] of the 3d parameter define src index,
	the bits [5-4] define dst index, and bits [3-0] define zeroing
	mask for dst */
	/*	sx := ndx6-7
		sval := (sx == 0) ? src0 : ((sx == 1) ? src1 : ((sx == 2) ? src2 : src3))

		dx := ndx4-5
		r0 := (dx == 0) ? sval : dst0
		r1 := (dx == 1) ? sval : dst1
		r2 := (dx == 2) ? sval : dst2
		r3 := (dx == 3) ? sval : dst3

		zmask := ndx0-3
		r0 := (zmask0 == 1) ? +0.0 : r0
		r1 := (zmask1 == 1) ? +0.0 : r1
		r2 := (zmask2 == 1) ? +0.0 : r2
		r3 := (zmask3 == 1) ? +0.0 : r3 */
	extern __m128 _mm_insert_ps(__m128 dst, __m128 src, const int ndx);

	/*Extract binary representation of single precision float from
	packed single precision array element selected by index */
	//src=(src0, src1, src2, src3)
	//则r = (ndx == 0) ? src0 : ((ndx == 1) ? src1 : ((ndx == 2) ? src2 : src3))
	//Only the least significant two bits of ndx are used
	extern int _mm_extract_ps(__m128 src, const int ndx);

	/*Insert integer into packed integer array element
	selected by index */
	//则r0=(ndx == 0) ? s : dst0, ..., r15=(ndx == 15) ? s : dst15
	//Only the lowest 8 bits of s are used,
	//Only the least significant 4 bits of ndx are used
	extern __m128i _mm_insert_epi8 (__m128i dst, int s, const int ndx);
	//则r0=(ndx == 0) ? s : dst0, ..., r3=(ndx == 3) ? s : dst3
	//Only the least significant 2 bits of ndx are interpreted
	extern __m128i _mm_insert_epi32(__m128i dst, int s, const int ndx);
	//则r0=(ndx == 0) ? s : dst0, r1=(ndx == 1) ? s : dst1
	//Only the least significant bit of ndx is interpreted
	extern __m128i _mm_insert_epi64(__m128i dst, __int64 s, const int ndx);

	/*Extract integer from packed integer array element
	selected by index */
	//则r=(ndx == 0) ? src0 : ((ndx == 1) ? src1 : ...((ndx == 14) ? src14 : src15))
	//Only the least significant four bits of ndx are used
	//注意:The result is the unsigned equivalent of the appropriate 8-bits in parameter src
	extern int _mm_extract_epi8 (__m128i src, const int ndx);
	//则r=(ndx == 0) ? src0 : ((ndx == 1) ? src1 : ((ndx == 2) ? src2 : src3))
	//Only the least significant two bits of ndx are used.
	extern int _mm_extract_epi32(__m128i src, const int ndx);
	//则r = (ndx == 0) ? src0 : src1
	//Only the least significant bit of parameter ndx is used
	extern __int64 _mm_extract_epi64(__m128i src, const int ndx);

	/*Horizontal packed word minimum and its index in
	result[15:0] and result[18:16] respectively */
	//The lowest order 16 bits are the minimum value found in parameter shortValues.
	//The second-lowest order 16 bits are the index of the minimum value
	//found in parameter shortValues.
	extern __m128i _mm_minpos_epu16(__m128i shortValues);

	/* Packed/single float double precision rounding */
	//则r0=RND(val0), r1=RND(val1),详见参考文献1
	extern __m128d _mm_round_pd(__m128d val, int iRoundMode);
	//则r0=RND(val0), r1=dst1, 详见参考文献1
	// The lowest 64 bits are the result of the rounding function on val.
	//The higher order 64 bits are copied directly from input parameter dst
	extern __m128d _mm_round_sd(__m128d dst, __m128d val, int iRoundMode);

	/*Packed/single float single precision rounding */
	//则r0=RND(val0), r1=RND(val1), r2=RND(val2), r3=RND(val3),详见参考文献1
	extern __m128  _mm_round_ps(__m128  val, int iRoundMode);
	//则r0=RND(val0), r1=dst1, r2=dst2, r3=dst3,
	//The lowest 32 bits are the result of the rounding function on val.
	//The higher order 96 bits are copied directly from input parameter dst
	extern __m128  _mm_round_ss(__m128 dst, __m128  val, int iRoundMode);

	/*Packed integer sign-extension */
	//byteValues: A 128-bit parameter that contains four signed 8-bit integers
	//in the lower 32 bits, byteValues=(a0, a1, ..., a15)
	/*则r0 := a0
		r1 := (a0 < 0) ? 0xff : 0
		r2 := (a0 < 0) ? 0xff : 0
		r3 := (a0 < 0) ? 0xff : 0

		r4 := a1
		r5 := (a1 < 0) ? 0xff : 0
		r6 := (a1 < 0) ? 0xff : 0
		r7 := (a1 < 0) ? 0xff : 0

		r8 := a2
		r9 := (a2 < 0) ? 0xff : 0
		r10 := (a2 < 0) ? 0xff : 0
		r11 := (a2 < 0) ? 0xff : 0

		r12 := a3
		r13 := (a3 < 0) ? 0xff : 0
		r14 := (a3 < 0) ? 0xff : 0
		r15 := (a3 < 0) ? 0xff : 0 */
	extern __m128i _mm_cvtepi8_epi32 (__m128i byteValues);
	//shortValues: A 128-bit parameter that contains four signed 16-bit integers
	//in the lower 64 bits, shortValues=(a0, a1, ..., a7)
	/*则r0 := a0
		r1 := (a0 < 0) ? 0xffff : 0

		r2 := a1
		r3 := (a1 < 0) ? 0xffff : 0

		r4 := a2
		r5 := (a2 < 0) ? 0xffff : 0

		r6 := a3
		r7 := (a3 < 0) ? 0xffff : 0 */
	extern __m128i _mm_cvtepi16_epi32(__m128i shortValues);
	//byteValues: A 128-bit parameter that contains two signed 8-bit integers
	//in the lower 16 bits, byteValues=(a0, a1, ... , a15)
	/*则r0 := a0
		r1 := (a0 < 0) ? 0xff : 0
		r2 := (a0 < 0) ? 0xff : 0
		r3 := (a0 < 0) ? 0xff : 0
		r4 := (a0 < 0) ? 0xff : 0
		r5 := (a0 < 0) ? 0xff : 0
		r6 := (a0 < 0) ? 0xff : 0
		r7 := (a0 < 0) ? 0xff : 0

		r8 := a1
		r9 := (a1 < 0) ? 0xff : 0
		r10 := (a1 < 0) ? 0xff : 0
		r11 := (a1 < 0) ? 0xff : 0
		r12 := (a1 < 0) ? 0xff : 0
		r13 := (a1 < 0) ? 0xff : 0
		r14 := (a1 < 0) ? 0xff : 0
		r15 := (a1 < 0) ? 0xff : 0 */
	extern __m128i _mm_cvtepi8_epi64 (__m128i byteValues);
	//intValues: A 128-bit parameter that contains two signed 32-bit
	//integers in the lower 64 bits, intValues=(a0, a1, a2, a3)
	/*则r0 := a0
		r1 := (a0 < 0) ? 0xffffffff : 0
		r2 := a1
		r3 := (a1 < 0) ? 0xffffffff : 0*/
	extern __m128i _mm_cvtepi32_epi64(__m128i intValues);
	//shortValues:A 128-bit parameter that contains two signed 16-bit integers
	//in the lower 32 bits, shortValues=(a0, a1, ..., a7)
	/*则r0 := a0
		r1 := (a0 < 0) ? 0xffff : 0
		r2 := (a0 < 0) ? 0xffff : 0
		r3 := (a0 < 0) ? 0xffff : 0

		r4 := a1
		r5 := (a1 < 0) ? 0xffff : 0
		r6 := (a1 < 0) ? 0xffff : 0
		r7 := (a1 < 0) ? 0xffff : 0*/
	extern __m128i _mm_cvtepi16_epi64(__m128i shortValues);
	//byteValues:A 128-bit parameter that contains eight signed 8-bit integers
	//in the lower 64 bits, byteValues=(a0, a1, ..., a15)
	/*则r0 := a0
		r1 := (a0 < 0) ? 0xff : 0
		r2 := a1
		r3 := (a1 < 0) ? 0xff : 0
		...
		r14 := a7
		r15 := (a7 < 0) ? 0xff : 0*/
	extern __m128i _mm_cvtepi8_epi16 (__m128i byteValues);

	/*Packed integer zero-extension*/
	//byteValues:A 128-bit parameter that contains four unsigned 8-bit integers
	//in the lower 32 bits, byteValues=(a0, a1, ... , a15)
	/*则r0 := a0
		r1 := 0
		r2 := 0
		r3 := 0

		r4 := a1
		r5 := 0
		r6 := 0
		r7 := 0

		r8 := a2
		r9 := 0
		r10 := 0
		r11 := 0

		r12 := a3
		r13 := 0
		r14 := 0
		r15 := 0*/
	extern __m128i _mm_cvtepu8_epi32 (__m128i byteValues);
	//shortValues:A 128-bit parameter that contains four unsigned 16-bit integers
	//in the lower 64 bits, shortValues=(a0, a1, ... , a7)
	/*则r0 := a0
		r1 := 0

		r2 := a1
		r3 := 0

		r4 := a2
		r5 := 0

		r6 := a3
		r7 := 0*/
	extern __m128i _mm_cvtepu16_epi32(__m128i shortValues);
	//shortValues:A 128-bit parameter that contains two unsigned 8-bit integers
	//in the lower 16 bits, shortValues=(a0, a1, ..., a15)
	/*则r0 := a0
		r1 := 0
		r2 := 0
		r3 := 0
		r4 := 0
		r5 := 0
		r6 := 0
		r7 := 0

		r8 := a1
		r9 := 0
		r10 := 0
		r11 := 0
		r12 := 0
		r13 := 0
		r14 := 0
		r15 := 0*/
	extern __m128i _mm_cvtepu8_epi64 (__m128i shortValues);
	//intValues:A 128-bit parameter that contains two unsigned 32-bit integers
	//in the lower 64 bits, intValues=(a0, a1, a2, a3)
	/*则r0 = a0
		r1 = 0
		r2 = a1
		r3 = 0*/
	extern __m128i _mm_cvtepu32_epi64(__m128i intValues);
	//shortValues:A 128-bit parameter that contains two unsigned 16-bit integers
	//in the lower 32 bits, shortValues=(a0, a1, ... , a7)
	/*则r0 := a0
		r1 := 0
		r2 := 0
		r3 := 0

		r4 := a1
		r5 := 0
		r6 := 0
		r7 := 0*/
	extern __m128i _mm_cvtepu16_epi64(__m128i shortValues);
	//byteValues:A 128-bit parameter that contains eight unsigned 8-bit integers
	//in the lower 64 bits, byteValues=(a0, a1, ... , a15)
	/*则r0 := a0
		r1 := 0
		r2 := a1
		r3 := 0
		...
		r14 := a7
		r15 := 0*/
	extern __m128i _mm_cvtepu8_epi16 (__m128i byteValues);

	/*Pack 8 double words from 2 operands into 8 words of result
	with unsigned saturation */
	//val1=(val10,...,val13), val2=(val20, ..., val23)
	/*则r0 := (val10 < 0) ? 0 : ((val10 > 0xffff) ? 0xffff : val10)
		r1 := (val11 < 0) ? 0 : ((val11 > 0xffff) ? 0xffff : val11)
		r2 := (val12 < 0) ? 0 : ((val12 > 0xffff) ? 0xffff : val12)
		r3 := (val13 < 0) ? 0 : ((val13 > 0xffff) ? 0xffff : val13)
		r4 := (val20 < 0) ? 0 : ((val20 > 0xffff) ? 0xffff : val20)
		r5 := (val21 < 0) ? 0 : ((val21 > 0xffff) ? 0xffff : val21)
		r6 := (val22 < 0) ? 0 : ((val22 > 0xffff) ? 0xffff : val22)
		r7 := (val23 < 0) ? 0 : ((val23 > 0xffff) ? 0xffff : val23)*/
	extern __m128i _mm_packus_epi32(__m128i val1, __m128i val2);

	/*Sum absolute 8-bit integer difference of adjacent groups of 4 byte
	integers in operands. Starting offsets within operands are
	determined by mask */
	//s1, s2: sixteen 8-bit unsigned integers
	// msk0, msk1, and msk2 are the three least significant bits of parameter msk
	/*则i = msk2 * 4
		j = (msk1 * 2 + msk0) * 4
		for (k = 0; k < 8; k = k + 1) {
		t0 = abs(s1[i + k + 0] - s2[j + 0])
		t1 = abs(s1[i + k + 1] - s2[j + 1])
		t2 = abs(s1[i + k + 2] - s2[j + 2])
		t3 = abs(s1[i + k + 3] - s2[j + 3])
		r[k] = t0 + t1 + t2 + t3
		}*/
	extern __m128i _mm_mpsadbw_epu8(__m128i s1, __m128i s2, const int msk);

	/*
	* Load double quadword using non-temporal aligned hint
	*/
	//This instruction loads data from a specified address.The memory source must be
	//16-byte aligned because the return value consists of sixteen bytes.则r=*v1
	extern __m128i _mm_stream_load_si128(__m128i* v1);

nmmintrin.h为SSE4.2头文件,其文件中各函数的介绍:

	/*
	* Intrinsics for text/string processing.
	*/
	//Either the computed mask of MaxSize bits or its expansion to a 128-bit parameter.
	//If the return value is expanded, each bit of the result mask is expanded to a
	//byte or a word.详见参考文献2
	extern __m128i _mm_cmpistrm (__m128i a, __m128i b, const int mode);
	//An integer between 0 and MaxSize. MaxSize is returned when the computed mask equals 0.
	//Otherwise, the index of the leftmost or rightmost bit set to 1 in this mask.
	//详见参考文献2
	extern int     _mm_cmpistri (__m128i a, __m128i b, const int mode);
	//Either the computed mask of MaxSize bits or its expansion to a 128-bit parameter.
	//If the return value is expanded, each bit of the result mask is expanded to
	//a byte or a word.详见参考文献3
	extern __m128i _mm_cmpestrm (__m128i a, int la, __m128i b, int lb, const int mode);
	//An integer that ranges between 0 and MaxSize. MaxSize is returned when the
	//resulting bitmask is equal to 0. Otherwise, the index of either the leftmost
	//or rightmost bit set to 1 in this mask.详见参考文献3
	extern int     _mm_cmpestri (__m128i a, int la, __m128i b, int lb, const int mode);

	/*
	* Intrinsics for text/string processing and reading values of EFlags.
	*/
	//Returns one if the null character occurs in b. Otherwise, zero. When one is
	//returned, it means that b contains the ending fragment of the string that is
	//being compared.详见参考文献2
	extern int     _mm_cmpistrz (__m128i a, __m128i b, const int mode);
	//Zero if the resulting mask is equal to zero. Otherwise, one.
	//详见参考文献2
	extern int     _mm_cmpistrc (__m128i a, __m128i b, const int mode);
	//One if the null character occurs in a. Otherwise, zero. When one is returned,
	//it means that a contains the ending fragment of the string that is being compared.
	//详见参考文献2
	extern int     _mm_cmpistrs (__m128i a, __m128i b, const int mode);
	//bit0 of the resulting bitmask.详见参考文献2
	extern int     _mm_cmpistro (__m128i a, __m128i b, const int mode);
	//One if b does not contain the null character and the resulting mask is
	//equal to zero. Otherwise, zero. 详见参考文献2
	extern int     _mm_cmpistra (__m128i a, __m128i b, const int mode);
	//One if the absolute value of lb is less than MaxSize. Otherwise, zero.详见参考文献3
	extern int     _mm_cmpestrz (__m128i a, int la, __m128i b, int lb, const int mode);
	//Zero if the resulting mask is equal to zero. Otherwise, one.详见参考文献3
	extern int     _mm_cmpestrc (__m128i a, int la, __m128i b, int lb, const int mode);
	//One if the absolute value of la is less than MaxSize. Otherwise, zero.详见参考文献3
	extern int     _mm_cmpestrs (__m128i a, int la, __m128i b, int lb, const int mode);
	//bit0 of the resulting bitmask. 详见参考文献3
	extern int     _mm_cmpestro (__m128i a, int la, __m128i b, int lb, const int mode);
	//One if the absolute value of lb is larger than or equal to MaxSize and the
	//resulting mask is equal to zero. Otherwise, zero.详见参考文献3
	extern int     _mm_cmpestra (__m128i a, int la, __m128i b, int lb, const int mode);

	/*
	* Packed integer 64-bit comparison, zeroing or filling with ones
	* corresponding parts of result
	*/
	//val1=(val10, val11), val2=(val20, val21)
	//则,r0 = (val10 > val20) ? 0xffffffffffffffff : 0x0
	//	 r1 = (val11 > val21) ? 0xffffffffffffffff : 0x0
	extern __m128i _mm_cmpgt_epi64(__m128i val1, __m128i val2);

	/*
	* Calculate a number of bits set to 1
	*/
	//The number of bits set to one in v
	extern int _mm_popcnt_u32(unsigned int v);
	//The number of bits set to one in v
	extern __int64 _mm_popcnt_u64(unsigned __int64 v);

	/*
	* Accumulate CRC32 (polynomial 0x11EDC6F41) value
	*/
	//crc:循环冗余校验码,CRC32-C algorithm is based on polynomial 0x1EDC6F41,
	//r = crc + CRC-32C(v)
	extern unsigned int _mm_crc32_u8 (unsigned int crc, unsigned char v);
	//crc:循环冗余校验码,CRC32-C algorithm is based on polynomial 0x1EDC6F41,
	//r = crc + CRC-32C(v)
	extern unsigned int _mm_crc32_u16(unsigned int crc, unsigned short v);
	//crc:循环冗余校验码,CRC32-C algorithm is based on polynomial 0x1EDC6F41,
	//r = crc + CRC-32C(v)
	extern unsigned int _mm_crc32_u32(unsigned int crc, unsigned int v);
	//crc:循环冗余校验码,CRC32-C algorithm is based on polynomial 0x1EDC6F41,
	//r = crc + CRC-32C(v)
	extern unsigned __int64 _mm_crc32_u64(unsigned __int64 crc, unsigned __int64 v);

参考文献:

1、http://msdn.microsoft.com/zh-cn/library/bb514044(v=vs.100).aspx

2、http://msdn.microsoft.com/zh-cn/library/bb513993(v=vs.100).aspx

3、http://msdn.microsoft.com/zh-cn/library/bb514048(v=vs.100).aspx

再分享一下我老师大神的人工智能教程吧。零基础!通俗易懂!风趣幽默!还带黄段子!希望你也加入到我们人工智能的队伍中来!https://blog.csdn.net/jiangjunshow

原文地址:https://www.cnblogs.com/xkiwnchwhd/p/10318860.html

时间: 2024-11-05 18:40:47

SSE4.1和SSE4.2 Intrinsics各函数介绍的相关文章

SSE2 Intrinsics各函数介绍

SIMD相关头文件包括: //#include <ivec.h>//MMX //#include <fvec.h>//SSE(also include ivec.h) //#include <dvec.h>//SSE2(also include fvec.h) #include <mmintrin.h> //MMX #include <xmmintrin.h> //SSE(include mmintrin.h) #include <emmi

Neon Intrinsics各函数介绍

#ifndef __ARM_NEON__ #error You must enable NEON instructions (e.g. -mfloat-abi=softfp -mfpu=neon) to use arm_neon.h #endif /*(1).正常指令:生成大小同样且类型通常与操作数向量同样的结果向量: (2).长指令:对双字向量操作数运行运算,生成四字向量的结果.所生成的元素通常是操作数元素宽度的两倍, 并属于同一类型. (3).宽指令:一个双字向量操作数和一个四字向量操作数运

0-C相关01:NSlog函数介绍。

  NSlog()函数介绍: 首先:NSlog()函数是cocoa的框架中提供的一个方法: 下图中最上方是它在Xcode中的路径: : 同样都是输出函数.下边我们来看一下,在O-C中NSlog()和在 c 语言中的printf的一些不同: 1.nslog 和printf都可以输出字符串到控制台.@"1213244" @开头表示oc的字符串. 2.NSlog()在打印时能自带一次自动换行,后者没有,想换行需要\手动添加"\n".当然在NSlog()中也可以手动添加&q

1.socket编程:socket编程,网络字节序,函数介绍,IP地址转换函数,sockaddr数据结构,网络套接字函数,socket相关函数,TCP server和client

 1  Socket编程 socket这个词可以表示很多概念: 在TCP/IP协议中,"IP地址+TCP或UDP端口号"唯一标识网络通讯中的一个进程,"IP 地址+端口号"就称为socket. 在TCP协议中,建立连接的两个进程各自有一个socket来标识,那么这两个socket组成的socket pair就唯一标识一个连接.socket本身有"插座"的意思,因此用来描述网络连 接的一对一关系. TCP/IP协议最早在BSD UNIX上实现,

第16课-数据库开发及ado.net-数据库SQl,创建数据库和表,增删改语句,约束,top和Distinct,聚合函数介绍

第16课-数据库开发及ado.net 数据库SQL,创建数据库和表,增删改语句,约束,top和Distinct,聚合函数介绍 SQL语句入门(脚本.命令) SQL全名是结构化查询语言(Structured Query Language) SQL语句是和DBMS“交谈”专用的语言,不同的DBMS都认SQL语法. Sql中字符串使用单引号:通过写两个单引号来转义一个单引号. Sql中的注释“--” 单行注释比较好 判断两个数据是否相等使用=(单等号) 在sql语句中sql代码不区分大小写 SQL主要

JQuery AJAX函数介绍

jQuery 库拥有完整的 Ajax 兼容套件.其中的函数和方法允许我们在不刷新浏览器的情况下从服务器加载数据. 函数介绍 jQuery.ajax():执行异步HTTP(Ajax)请求. .ajaxComplete():当Ajax请求完成时注册要调用的处理程序.这是一个Ajax事件. .ajaxError():当Ajax请求完成且出现错误时注册要调用的处理程序.这是一个Ajax事件. .ajaxSend():在Ajax请求发送之前显示一条消息. jQuery.ajaxSetup():设置将来的Aja

php session_id()函数介绍及代码实例

session_id()功能: 获取设置当前会话ID. 函数说明: string session_id ([ string $id ] ) 参数: 如果指定了参数$id,那么函数会替换当前的会话id. session_id()函数必须在session_start()函数之前调用. 返回值: session_id返回当前会话id字符串.如果当前没有产生会话,则返回空字符串"". 代码示例1: 输出 session_id() 1 2 3 4 <?php     session_sta

postgis经常使用函数介绍(一)

概述: 在进行地理信息系统开发的过程中,经常使用的空间数据库有esri的sde,postgres的postgis以及mySQL的mysql gis等等,在本文.给大家介绍的是有关postgis的一些经常使用函数的意思以及使用. 说明: 本文中所使用postgres的版本号为9.4.0.你可从我的百度网盘获取相关的安装包,安装包地址例如以下: postgres:http://pan.baidu.com/s/1o69WORK postgres空间扩展:http://pan.baidu.com/s/1

PHP ob_start() 函数介绍

ob_start() 函数介绍: http://www.nowamagic.net/php/php_ObStart.php ob_start()作用: http://zhidao.baidu.com/link?url=qhOcqHCNitPuSSKZOVI8bsW_eZaJYqZJ1cLctQDTWKvnBbV4pQVyYvfi3-v4whainj_WNTiQw2KPex6ZUGqR0IiujRWV79PtOh3jAPUwAEu ob相关函数 http://www.nowamagic.net/l