Petr Kobalicek

Software engineer focusing on writing

high-performance code in C++ & Asm

This article provides alternative implementations, in baseline SSE2, of some instructions that were introduced by the SSSE3 and SSE4.1 extensions.

SSSE3 extensions introduced instructions for computing packed absolute values of 8-bit, 16-bit, and 32-bit integers. It's possible to implement PABSW/PABSD in pure SSE2, and also a missing PABSQ (packed absolute value of 64-bit integers), which was introduced pretty recently by AVX512-F.

Let's start with a straightforward implementation in C first:

```
// Absolute value of a 32-bit signed integer, returned as unsigned so
// that abs32(INT32_MIN) is well-defined (wraps to 2^31). Compilers
// typically lower this branch into a mask/xor/sub sequence.
inline uint32_t abs32(int32_t x) {
  if (x < 0)
    return (uint32_t)0 - (uint32_t)x;
  return (uint32_t)x;
}
```

Although it contains branches, C++ compilers are usually able to recognize such code and create an optimized branch-less version of it, either via masking or by taking advantage of how 2s complement negation works. The idea is to spread the sign bit (which would only be set if the number is negative) to all other bits by an arithmetic shift, to XOR the result with the input, and then to subtract it from the input as well. This approach is essentially a conditional negation of a number. Considering that a `mask`

is all zeros (positive number) or all ones (negative number), the conditional/masked negations would be implemented as `(x ^ mask) - mask`

.

The C++ code can be directly translated to SSE2 for 16-bit and 32-bit integer sizes:

```
; SSE2 compatible PABSW implementation
; xmm0 = in|out
; xmm7 = temporary (mask)
movdqa xmm7, xmm0 ; Copy input to temporary
psraw xmm7, 15 ; Arithmetic shift by 15 broadcasts each sign bit: mask is 0x0000 or 0xFFFF per lane
pxor xmm0, xmm7 ; Conditional bit-NOT of negative lanes (x ^ mask)
psubw xmm0, xmm7 ; Subtracting -1 adds one to negative lanes, completing 2s complement negation
; SSE2 compatible PABSD implementation
; xmm0 = in|out
; xmm7 = temporary (mask)
movdqa xmm7, xmm0 ; Copy input to temporary
psrad xmm7, 31 ; Arithmetic shift by 31 broadcasts each sign bit: mask is 0 or 0xFFFFFFFF per lane
pxor xmm0, xmm7 ; Conditional bit-NOT of negative lanes (x ^ mask)
psubd xmm0, xmm7 ; Subtracting -1 adds one to negative lanes, completing 2s complement negation
```

64-bit packed absolute value is trickier as there is no PSRAQ instruction in SSE2 (VPSRAQ was first introduced in AVX512-F), however, we can shuffle the input a bit and use PSRAD again:

```
; SSE2 compatible PABSQ implementation
; xmm0 = in|out
; xmm7 = temporary (mask)
pshufd xmm7, xmm0, 0xF5 ; Like _MM_SHUFFLE(3, 3, 1, 1) - duplicate the high dword of each qword
psrad xmm7, 31 ; Arithmetic shift right yields a full 64-bit sign mask per qword (no PSRAQ in SSE2)
pxor xmm0, xmm7 ; Conditional bit-NOT of negative lanes (x ^ mask)
psubq xmm0, xmm7 ; Subtracting -1 adds one to negative lanes, completing 2s complement negation
```

These were straightforward translations based on the initial C++ code. However, there is a better way of implementing PABSW and there is also a way of implementing PABSB without any shifts (because there is no packed shift that operates on 8-bit entities). Since absolute value could be also written as `max(x, -x)`

we can use packed min/max to implement PABSB and PABSW:

```
; SSE2 compatible PABSW implementation
; xmm0 = in|out
; xmm7 = temporary (negated copy)
pxor xmm7, xmm7 ; Zero xmm7 (temporary)
psubw xmm7, xmm0 ; xmm7 = 0 - x: negate every 16-bit lane
pmaxsw xmm0, xmm7 ; |x| = max(x, -x); signed 16-bit max exists in baseline SSE2
; SSE2 compatible PABSB implementation
; xmm0 = in|out
; xmm7 = temporary (negated copy)
pxor xmm7, xmm7 ; Zero xmm7 (temporary)
psubb xmm7, xmm0 ; xmm7 = 0 - x: negate every 8-bit lane
pminub xmm0, xmm7 ; Unsigned byte MIN of {x, -x} selects |x| (works around missing PMAXSB, SSE4.1)
```

The PABSW implementation is straightforward and I have nothing to add, however, the PABSB implementation is interesting as it works around the missing PMAXSB instruction (which was introduced in SSE4.1) by using PMINUB instead. This works for us based on the knowledge we have about both inputs - selecting the minimum unsigned value is the same as selecting the maximum signed value in our case.

C++ intrinsics version:

```
// SSE2 compatible PABSB implementation.
//
// There is no PMAXSB before SSE4.1, so we take the unsigned byte
// minimum of x and -x: for each signed byte, min_u8(x, -x) == |x|.
static inline __m128i sse2_pabsb(const __m128i& x) {
  __m128i negated = _mm_sub_epi8(_mm_setzero_si128(), x);
  return _mm_min_epu8(x, negated);
}
// SSE2 compatible PABSW implementation.
//
// |x| == max(x, -x), and signed 16-bit max exists in baseline SSE2.
static inline __m128i sse2_pabsw(const __m128i& x) {
  __m128i negated = _mm_sub_epi16(_mm_setzero_si128(), x);
  return _mm_max_epi16(x, negated);
}
// SSE2 compatible PABSD implementation.
//
// Arithmetic shift by 31 yields 0 or 0xFFFFFFFF per lane; XOR plus
// subtract of that mask is a conditional 2s complement negation.
static inline __m128i sse2_pabsd(const __m128i& x) {
  __m128i sign = _mm_srai_epi32(x, 31);
  __m128i flipped = _mm_xor_si128(x, sign);
  return _mm_sub_epi32(flipped, sign);
}
// SSE2 compatible PABSQ implementation.
//
// There is no 64-bit arithmetic shift in SSE2, so first replicate the
// high dword of each qword, then PSRAD produces a full 64-bit sign mask.
static inline __m128i sse2_pabsq(const __m128i& x) {
  __m128i hi = _mm_shuffle_epi32(x, _MM_SHUFFLE(3, 3, 1, 1));
  __m128i sign = _mm_srai_epi32(hi, 31);
  return _mm_sub_epi64(_mm_xor_si128(x, sign), sign);
}
```

Packed min/max of unsigned 16-bit integers (PMINUW and PMAXUW) were introduced by the SSE4.1 extensions, which added the remaining min/max variants missing from baseline SSE2; for 16-bit elements SSE2 only provides the signed versions (PMINSW and PMAXSW).

The solution is to use PSUBUSW (packed subtract with unsigned saturation) and then to subtract the result from the original value to get the packed unsigned minimum. This trick of course only works for packed uint16_t operations as X86 SIMD doesn't have instructions to perform packed saturated addition|subtraction of elements greater than 16 bits.

```
; SSE2 compatible PMINUW implementation
; xmm0 = in|out
; xmm1 = in
; xmm7 = temporary
movdqa xmm7, xmm0 ; Copy xmm0 to temporary
psubusw xmm7, xmm1 ; Unsigned saturated subtract: xmm7 = max(x - y, 0) per lane
psubw xmm0, xmm7 ; x - max(x - y, 0) == min(x, y); cannot underflow
```

Why does it work? If we perform `a-b`

with unsigned saturation we get either zero, which means that `b`

is either equal or greater than `a`

, or some non-zero value, which means that `a`

is greater than `b`

, and the value is their difference (unsigned). Based on these we can subtract that value from the original `a`

and get our unsigned minimum.

The machine code of pmaxuw implementation would be much simpler:

```
; SSE2 compatible PMAXUW implementation
; xmm0 = in|out
; xmm1 = in
psubusw xmm0, xmm1 ; Unsigned saturated subtract: xmm0 = max(x - y, 0) per lane
paddw xmm0, xmm1 ; y + max(x - y, 0) == max(x, y); cannot overflow
```

C++ intrinsics version:

```
// SSE2 compatible PMINUW implementation.
//
// PSUBUSW clamps at zero: (x -sat y) is 0 when y >= x, otherwise the
// difference x - y. Subtracting that from x yields min(x, y) without
// the SSE4.1 PMINUW instruction.
static inline __m128i sse2_pminuw(const __m128i& x, const __m128i& y) {
  __m128i diff = _mm_subs_epu16(x, y);
  return _mm_sub_epi16(x, diff);
}
// SSE2 compatible PMAXUW implementation.
//
// y + max(x - y, 0) == max(x, y); the saturated subtract supplies the
// max(x - y, 0) term, and the final add can never overflow.
static inline __m128i sse2_pmaxuw(const __m128i& x, const __m128i& y) {
  __m128i diff = _mm_subs_epu16(x, y);
  return _mm_add_epi16(diff, y);
}
```

Floating point rounding was always a painful job on X86 targets because of lacking hardware support. Historically, X86 offered only instructions to round by using the current rounding mode, which is in most cases assumed to be round-to-even. Round-to-even is the best option for intermediate computations (all floating point computations you do would be rounded this way by default), however, it's probably the least used "explicit" rounding mode. In C/C++ the most commonly used rounding functions are `floor()`

, `trunc()`

, and `ceil()`

.

The following C++ code provides rounding functions of a double precision floating point that use baseline SSE2 instructions:

```
#include <stdint.h>
#include <emmintrin.h>
// SSE2 compatible blend: a bitwise select that returns the bits of `y`
// where `mask` is set and the bits of `x` where it is clear, i.e.
// (x & ~mask) | (y & mask). Callers pass an all-ones/all-zeros (or
// sign-bit-only, see sse2_trunc) mask produced by a compare.
//
// NOTE: The previous version AND-ed the mask with BOTH inputs, which
// returned zero whenever the mask was clear — _mm_andnot_pd restores
// the intended (mask ? y : x) semantics.
static inline __m128d sse2_blend(const __m128d& x, const __m128d& y, const __m128d& mask) {
  return _mm_or_pd(_mm_andnot_pd(mask, x),
                   _mm_and_pd(mask, y));
}
// SSE2 compatible ROUNDSD (nearby) implementation.
//
// Adding and then subtracting the "magic" constant 1.5 * 2^52 discards
// all fractional mantissa bits, rounding in the CPU's current mode
// (round-to-even by default). Values with |x| >= 2^52 are already
// integral (and the trick would be wrong for them), so the compare mask
// passes them through unchanged; cmpnlt is also true for NaN.
static inline double sse2_nearby(double x) {
  const __m128d kMaxN  = _mm_set_sd(4503599627370496.0);  // 2^52
  const __m128d kMagic = _mm_set_sd(6755399441055744.0);  // 1.5 * 2^52
  __m128d val = _mm_set_sd(x);
  __m128d big = _mm_cmpnlt_sd(val, kMaxN);
  __m128d out = _mm_sub_sd(_mm_add_sd(val, kMagic), kMagic);
  return _mm_cvtsd_f64(sse2_blend(out, val, big));
}
// SSE2 compatible ROUNDSD (trunc) implementation.
//
// Rounds the absolute value with the magic-number trick, subtracts one
// if that rounded up, then grafts the original sign back through the
// blend mask: for a small negative input the mask contains only the
// sign bit, so the blend copies src's sign bit onto the rounded value.
static inline double sse2_trunc(double x) {
  // All bits except the sign bit (abs mask).
  static const uint64_t kSepMask[1] = { 0x7FFFFFFFFFFFFFFFu };
  __m128d src = _mm_set_sd(x);
  // NOTE(review): type-puns uint64_t storage as double via
  // reinterpret_cast; memcpy into a double would be the strictly
  // conforming alternative.
  __m128d sep = _mm_load_sd(reinterpret_cast<const double*>(kSepMask));
  __m128d src_abs = _mm_and_pd(src, sep);       // |x|
  __m128d sign = _mm_andnot_pd(sep, src);       // isolated sign bit of x
  __m128d maxn = _mm_set_sd(4503599627370496.0);   // 2^52
  __m128d magic = _mm_set_sd(6755399441055744.0);  // 1.5 * 2^52
  // Pass-through mask for |x| >= 2^52 (already integral; also catches
  // NaN), OR-ed with the sign bit so the blend restores x's sign.
  __m128d msk = _mm_or_pd(_mm_cmpnlt_sd(src_abs, maxn), sign);
  __m128d rnd = _mm_sub_sd(_mm_add_sd(src_abs, magic), magic);
  // Magic rounding is round-to-nearest; subtract 1 where it rounded up.
  __m128d maybeone = _mm_and_pd(_mm_cmplt_sd(src_abs, rnd), _mm_set_sd(1.0));
  return _mm_cvtsd_f64(sse2_blend(_mm_sub_sd(rnd, maybeone), src, msk));
}
// SSE2 compatible ROUNDSD (floor) implementation.
//
// Uses the magic-number round-to-nearest trick, then corrects the
// result downward by one where rounding overshot the input. Values
// with |x| >= 2^52 (and NaN) are passed through via the blend mask.
static inline double sse2_floor(double x) {
  const __m128d kMaxN  = _mm_set_sd(4503599627370496.0);  // 2^52
  const __m128d kMagic = _mm_set_sd(6755399441055744.0);  // 1.5 * 2^52
  __m128d val = _mm_set_sd(x);
  __m128d big = _mm_cmpnlt_sd(val, kMaxN);
  __m128d out = _mm_sub_sd(_mm_add_sd(val, kMagic), kMagic);
  // Round-to-nearest may round up; subtract 1.0 when out > val.
  __m128d fixup = _mm_and_pd(_mm_cmplt_sd(val, out), _mm_set_sd(1.0));
  return _mm_cvtsd_f64(sse2_blend(_mm_sub_sd(out, fixup), val, big));
}
// SSE2 compatible ROUNDSD (ceil) implementation.
//
// Mirror image of sse2_floor: round to nearest with the magic number,
// then add one where the result fell below the input. Values with
// |x| >= 2^52 (and NaN) are passed through via the blend mask.
static inline double sse2_ceil(double x) {
  const __m128d kMaxN  = _mm_set_sd(4503599627370496.0);  // 2^52
  const __m128d kMagic = _mm_set_sd(6755399441055744.0);  // 1.5 * 2^52
  __m128d val = _mm_set_sd(x);
  __m128d big = _mm_cmpnlt_sd(val, kMaxN);
  __m128d out = _mm_sub_sd(_mm_add_sd(val, kMagic), kMagic);
  // Round-to-nearest may round down; add 1.0 when out < val.
  __m128d fixup = _mm_and_pd(_mm_cmpnle_sd(val, out), _mm_set_sd(1.0));
  return _mm_cvtsd_f64(sse2_blend(_mm_add_sd(out, fixup), val, big));
}
```

The code can actually be ported to use 2 double precision floating points instead of a scalar and thus to round 2 values at once.

Today's X86 CPUs will most likely all have the SSSE3 and SSE4.1 extensions, so you probably won't need these tricks. However, they can still be handy if you target a baseline SSE2 instruction set or a totally different instruction set that has similar limitations.