Tags
 IOS SQL HTML C RUBY-ON-RAILS MYSQL ASP.NET DEVELOPMENT RUBY .NET LINUX SQL-SERVER REGEX WINDOWS ALGORITHM ECLIPSE VISUAL-STUDIO STRING SVN PERFORMANCE APACHE-FLEX UNIT-TESTING SECURITY LINQ UNIX MATH EMAIL OOP LANGUAGE-AGNOSTIC VB6

Most efficient formula for unpacking 16-bit BCD? (e.g. 0x1234 to 0x01020304)

By : Yasmin Shaaban
Date : September 14 2020, 09:00 PM
Here is an alternative way, with fewer operations but a longer critical path, based on the binary decomposition of the distance each nibble has to move: first move the nibbles that travel 8 or 12 bit positions together by 8, then move the nibbles that travel 4 or 12 bit positions together by 4.
code :
``````x = bcd
x = ((x & 0xFF00) << 8) | (x & 0xFF)
x = ((x & 0x00F000F0) << 4) | (x & 0x000F000F)
``````
``````// start
0000ABCD
// move A and B by 8
00AB00CD
// move A and C by 4
0A0B0C0D
``````

Share :

Convert 0x1234 to 0x11223344

By : Faizan Bro
Date : March 29 2020, 07:55 AM
This can be done using SSE2 as follows:
code :
// Expands every nibble of a 64-bit value into a full byte holding that nibble
// twice, e.g. 0x1234 -> 0x11223344. The 128-bit result is returned in two
// halves.
//
//   in    - packed input; each of its 8 bytes becomes 2 output bytes
//   outLo - receives the expansion of the low 4 input bytes
//   outHi - receives the expansion of the high 4 input bytes
//
// Uses SSE2 instructions only. The 64-bit <-> XMM moves used here are
// available on x86-64 targets only.
void ExpandSSE2(unsigned long long in, unsigned long long &outLo, unsigned long long &outHi) {
    // Per 16-bit lane: keep the high nibble of the upper byte and the low
    // nibble of the lower byte.
    __m128i const mask = _mm_set1_epi16((short)0xF00F);
    __m128i const mul0 = _mm_set1_epi16(0x0011);
    __m128i const mul1 = _mm_set1_epi16(0x1000);

    __m128i v = _mm_cvtsi64_si128((long long)in); // Move the 64-bit value to a 128-bit register
    v = _mm_unpacklo_epi8(v, v);  // 0x12   -> 0x1212
    v = _mm_and_si128(v, mask);   // 0x1212 -> 0x1002
    v = _mm_mullo_epi16(v, mul0); // 0x1002 -> 0x1022
    v = _mm_mulhi_epu16(v, mul1); // 0x1022 -> 0x0102 (high half of a *0x1000 product == shift right by 4)
    v = _mm_mullo_epi16(v, mul0); // 0x0102 -> 0x1122

    // Extract both halves with SSE2-only instructions (_mm_extract_epi64 needs SSE4.1).
    outLo = (unsigned long long)_mm_cvtsi128_si64(v);
    outHi = (unsigned long long)_mm_cvtsi128_si64(_mm_unpackhi_epi64(v, v));
}
``````
``````ExpandOrig:               56.234 seconds  // From asker's original question
ExpandSmallLUT:           30.209 seconds  // From Dmitry's answer
ExpandLookupSmallOneLUT:  33.689 seconds  // from Dmitry's answer
ExpandLookupLarge:        51.312 seconds  // A straightforward lookup table
ExpandAShelly:            43.829 seconds  // From AShelly's answer
ExpandAShellyMulOp:       43.580 seconds  // AShelly's answer with an optimization
ExpandSSE4:               17.854 seconds  // My original SSE4 answer
ExpandSSE4Unroll:         17.405 seconds  // My original SSE4 answer with loop unrolling
ExpandSSE2:               17.281 seconds  // My current SSE2 answer
ExpandSSE2Unroll:         17.152 seconds  // My current SSE2 answer with loop unrolling
``````
``````#define DATA_SIZE_IN  ((unsigned)(1024 * 1024 * 128)) // Size in bytes of the packed input buffer (128 MiB)
#define DATA_SIZE_OUT ((unsigned)(2 * DATA_SIZE_IN)) // Size in bytes of each output buffer: every input byte expands to two
#define RERUN_COUNT   500 // Number of back-to-back calls of an Expand* function per timed measurement

#include <cstdlib>
#include <cstring>
#include <ctime>
#include <iomanip>
#include <iostream>
#include <utility>
#include <emmintrin.h> // SSE2
#include <tmmintrin.h> // SSSE3
#include <smmintrin.h> // SSE4

// Baseline expansion using shifts and masks only: every nibble of the input
// becomes one output byte holding that nibble twice (0x1234 -> 0x11223344).
//
//   in   - first input byte
//   past - one past the last input byte; (past - in) must be a multiple of 4
//   out  - receives 2 * (past - in) bytes
//
// An empty range (in == past) writes nothing. Assumes a 32-bit `unsigned`.
void ExpandOrig(unsigned char const *in, unsigned char const *past, unsigned char *out) {
    while (in != past) {
        // memcpy instead of a pointer cast: no alignment or aliasing assumptions
        // about the byte buffers.
        unsigned packed;
        std::memcpy(&packed, in, sizeof packed);
        unsigned u = packed & 0x0000FFFF; // low 16-bit group
        unsigned v = packed >> 16;        // high 16-bit group

        // Do computation: the overlapping masks place each nibble at both
        // nibble positions of its destination byte.
        u  =   (u & 0x00FF) << 4
             | (u & 0x000F)
             | (u & 0x0FF0) << 8
             | (u & 0xFF00) << 12
             | (u & 0xF000) << 16;
        v  =   (v & 0x00FF) << 4
             | (v & 0x000F)
             | (v & 0x0FF0) << 8
             | (v & 0xFF00) << 12
             | (v & 0xF000) << 16;

        // Store data
        std::memcpy(out,     &u, sizeof u);
        std::memcpy(out + 4, &v, sizeof v);
        in  += 4;
        out += 8;
    }
}

// Byte-indexed lookup tables. LutLo[b] holds the 16-bit expansion of byte b
// (each nibble doubled into a byte); LutHi[b] holds the same expansion placed
// in the upper 16 bits of the entry.
unsigned LutLo[256],
         LutHi[256];

// Fills LutLo: for byte 0xAB the entry becomes 0x0000AABB.
void MakeLutLo(void) {
    for (unsigned byte = 0; byte != 256; ++byte) {
        // Spread the two nibbles into separate bytes (0xAB -> 0x0A0B) ...
        unsigned const spread = ((byte & 0xF0) << 4) | (byte & 0x0F);
        // ... then copy each nibble into the upper half of its byte.
        LutLo[byte] = spread | (spread << 4);
    }
}

// Fills LutHi: for byte 0xAB the entry becomes 0xAABB0000.
void MakeLutHi(void) {
    for (unsigned byte = 0; byte != 256; ++byte) {
        // Same spreading as MakeLutLo, pre-shifted by 16 bits.
        unsigned const spread = ((byte & 0xF0) << 20) | ((byte & 0x0F) << 16);
        LutHi[byte] = spread | (spread << 4);
    }
}

// Expansion through the two 256-entry tables LutHi and LutLo (both must have
// been filled before the first call): one lookup per input byte, with the
// upper byte's entry already positioned in the upper 16 bits.
//
//   in   - first input byte
//   past - one past the last input byte; (past - in) must be a multiple of 4
//   out  - receives 2 * (past - in) bytes
//
// An empty range (in == past) writes nothing. Assumes a 32-bit `unsigned`.
void ExpandLookupSmall(unsigned char const *in, unsigned char const *past, unsigned char *out) {
    while (in != past) {
        // memcpy instead of a pointer cast: no alignment or aliasing assumptions
        // about the byte buffers.
        unsigned packed;
        std::memcpy(&packed, in, sizeof packed);
        unsigned u = packed & 0x0000FFFF; // low 16-bit group
        unsigned v = packed >> 16;        // high 16-bit group

        // Do computation
        u = LutHi[u >> 8] | LutLo[u & 0xFF];
        v = LutHi[v >> 8] | LutLo[v & 0xFF];

        // Store data
        std::memcpy(out,     &u, sizeof u);
        std::memcpy(out + 4, &v, sizeof v);
        in  += 4;
        out += 8;
    }
}

// Expansion through the single 256-entry table LutLo (must have been filled
// before the first call): the entry for the upper byte of each 16-bit group is
// shifted into place at run time instead of coming from a second table.
//
//   in   - first input byte
//   past - one past the last input byte; (past - in) must be a multiple of 4
//   out  - receives 2 * (past - in) bytes
//
// An empty range (in == past) writes nothing. Assumes a 32-bit `unsigned`.
void ExpandLookupSmallOneLUT(unsigned char const *in, unsigned char const *past, unsigned char *out) {
    while (in != past) {
        // memcpy instead of a pointer cast: no alignment or aliasing assumptions
        // about the byte buffers.
        unsigned packed;
        std::memcpy(&packed, in, sizeof packed);
        unsigned u = packed & 0x0000FFFF; // low 16-bit group
        unsigned v = packed >> 16;        // high 16-bit group

        // Do computation
        u = ((LutLo[u >> 8] << 16) | LutLo[u & 0xFF]);
        v = ((LutLo[v >> 8] << 16) | LutLo[v & 0xFF]);

        // Store data
        std::memcpy(out,     &u, sizeof u);
        std::memcpy(out + 4, &v, sizeof v);
        in  += 4;
        out += 8;
    }
}

// 65536-entry table mapping every 16-bit input directly to its 32-bit expansion.
unsigned LutLarge[256 * 256];

// Builds LutLarge from the two byte tables; LutLo and LutHi must already be filled.
void MakeLutLarge(void) {
    unsigned *slot = LutLarge;
    // Walking the high byte in the outer loop and the low byte in the inner
    // loop visits the entries in index order (index == hi * 256 + lo).
    for (unsigned hi = 0; hi != 256; ++hi) {
        unsigned const upper = LutHi[hi];
        for (unsigned lo = 0; lo != 256; ++lo)
            *slot++ = upper | LutLo[lo];
    }
}

// Expansion through the 65536-entry table LutLarge (must have been filled
// before the first call): a single lookup per 16-bit group.
//
//   in   - first input byte
//   past - one past the last input byte; (past - in) must be a multiple of 4
//   out  - receives 2 * (past - in) bytes
//
// An empty range (in == past) writes nothing. Assumes a 32-bit `unsigned`.
void ExpandLookupLarge(unsigned char const *in, unsigned char const *past, unsigned char *out) {
    while (in != past) {
        // memcpy instead of a pointer cast: no alignment or aliasing assumptions
        // about the byte buffers.
        unsigned packed;
        std::memcpy(&packed, in, sizeof packed);
        unsigned u = packed & 0x0000FFFF; // low 16-bit group
        unsigned v = packed >> 16;        // high 16-bit group

        // Do computation
        u = LutLarge[u];
        v = LutLarge[v];

        // Store data
        std::memcpy(out,     &u, sizeof u);
        std::memcpy(out + 4, &v, sizeof v);
        in  += 4;
        out += 8;
    }
}

// Expansion using multiply-and-mask "magic numbers": the multiplications
// spread the four nibbles of a 16-bit group to one per byte (0x1234 ->
// 0x01020304), then an add of the value shifted by 4 doubles each nibble.
//
//   in   - first input byte
//   past - one past the last input byte; (past - in) must be a multiple of 4
//   out  - receives 2 * (past - in) bytes
//
// An empty range (in == past) writes nothing. Assumes a 32-bit `unsigned`.
void ExpandAShelly(unsigned char const *in, unsigned char const *past, unsigned char *out) {
    while (in != past) {
        // memcpy instead of a pointer cast: no alignment or aliasing assumptions
        // about the byte buffers.
        unsigned packed;
        std::memcpy(&packed, in, sizeof packed);
        unsigned const u = packed & 0x0000FFFF; // low 16-bit group
        unsigned const v = packed >> 16;        // high 16-bit group

        // Do computation: nibbles 0 and 2 go through the first product,
        // nibbles 1 and 3 through the second; the masks keep one copy of each.
        unsigned w = (((u & 0xF0F) * 0x101) & 0xF000F) + (((u & 0xF0F0) * 0x1010) & 0xF000F00);
        unsigned x = (((v & 0xF0F) * 0x101) & 0xF000F) + (((v & 0xF0F0) * 0x1010) & 0xF000F00);
        w += w * 0x10;
        x += x * 0x10;

        // Store data
        std::memcpy(out,     &w, sizeof w);
        std::memcpy(out + 4, &x, sizeof x);
        in  += 4;
        out += 8;
    }
}

// Same multiply-and-mask expansion as ExpandAShelly, but the final nibble
// doubling is folded into a single multiplication by 0x11 instead of a
// shift-and-add.
//
//   in   - first input byte
//   past - one past the last input byte; (past - in) must be a multiple of 4
//   out  - receives 2 * (past - in) bytes
//
// An empty range (in == past) writes nothing. Assumes a 32-bit `unsigned`.
void ExpandAShellyMulOp(unsigned char const *in, unsigned char const *past, unsigned char *out) {
    while (in != past) {
        // memcpy instead of a pointer cast: no alignment or aliasing assumptions
        // about the byte buffers.
        unsigned packed;
        std::memcpy(&packed, in, sizeof packed);
        unsigned u = packed & 0x0000FFFF; // low 16-bit group
        unsigned v = packed >> 16;        // high 16-bit group

        // Do computation
        u = ((((u & 0xF0F) * 0x101) & 0xF000F) + (((u & 0xF0F0) * 0x1010) & 0xF000F00)) * 0x11;
        v = ((((v & 0xF0F) * 0x101) & 0xF000F) + (((v & 0xF0F0) * 0x1010) & 0xF000F00)) * 0x11;

        // Store data
        std::memcpy(out,     &u, sizeof u);
        std::memcpy(out + 4, &v, sizeof v);
        in  += 4;
        out += 8;
    }
}

void ExpandSSE4(unsigned char const *in, unsigned char const *past, unsigned char *out) {
mul = _mm_set1_epi16(0x0011);
__m128i       u, v, w, x;
do {
// Read input into low 8 bytes of u and v

v = _mm_unpackhi_epi8(u, u);      // Expand each single byte to two bytes
u = _mm_unpacklo_epi8(u, u);      // Do it again for v
w = _mm_srli_epi16(u, 4);         // Copy the value into w and shift it right half a byte
x = _mm_srli_epi16(v, 4);         // Do it again for v
u = _mm_blendv_epi8(u, w, mask0); // Select odd bytes from w, and even bytes from v, giving the the desired value in the upper nibble of each byte
v = _mm_blendv_epi8(v, x, mask0); // Do it again for v
u = _mm_and_si128(u, mask1);      // Clear the all the upper nibbles
v = _mm_and_si128(v, mask1);      // Do it again for v
u = _mm_mullo_epi16(u, mul);      // Multiply each 16-bit value by 0x0011 to duplicate the lower nibble in the upper nibble of each byte
v = _mm_mullo_epi16(v, mul);      // Do it again for v

// Write output
_mm_store_si128((__m128i*)(out     ), u);
_mm_store_si128((__m128i*)(out + 16), v);
in  += 16;
out += 32;
} while (in != past);
}

// SSE4.1 expansion, manually unrolled four times (64 input bytes per
// iteration). Each step of the ExpandSSE4 sequence is issued for all four
// blocks before moving to the next step.
//
//   in   - first input byte, 16-byte aligned (aligned loads are used)
//   past - one past the last input byte; (past - in) must be a multiple of 64
//   out  - receives 2 * (past - in) bytes, 16-byte aligned (aligned stores)
//
// An empty range (in == past) writes nothing.
#if defined(__GNUC__) && !defined(__SSE4_1__)
__attribute__((target("sse4.1"))) // lets GCC/Clang build this function without -msse4.1
#endif
void ExpandSSE4Unroll(unsigned char const *in, unsigned char const *past, unsigned char *out) {
    __m128i const mask0 = _mm_set1_epi16((short)0x8000), // blend selector: high bit set in every odd byte
                  mask1 = _mm_set1_epi8(0x0F),           // keeps the low nibble of every byte
                  mul   = _mm_set1_epi16(0x0011);
    __m128i       u0, v0, w0, x0,
                  u1, v1, w1, x1,
                  u2, v2, w2, x2,
                  u3, v3, w3, x3;
    while (in != past) {
        // Read four 16-byte input blocks
        u0 = _mm_load_si128((__m128i const*)(in     ));
        u1 = _mm_load_si128((__m128i const*)(in + 16));
        u2 = _mm_load_si128((__m128i const*)(in + 32));
        u3 = _mm_load_si128((__m128i const*)(in + 48));

        // Duplicate every byte (0xAB -> 0xABAB): vN gets the upper 8 bytes of
        // block N, uN keeps the lower 8 bytes.
        v0 = _mm_unpackhi_epi8(u0, u0);
        u0 = _mm_unpacklo_epi8(u0, u0);
        v1 = _mm_unpackhi_epi8(u1, u1);
        u1 = _mm_unpacklo_epi8(u1, u1);
        v2 = _mm_unpackhi_epi8(u2, u2);
        u2 = _mm_unpacklo_epi8(u2, u2);
        v3 = _mm_unpackhi_epi8(u3, u3);
        u3 = _mm_unpacklo_epi8(u3, u3);

        // Copies shifted right by half a byte (0xABAB -> 0x0ABA)
        w0 = _mm_srli_epi16(u0, 4);
        x0 = _mm_srli_epi16(v0, 4);
        w1 = _mm_srli_epi16(u1, 4);
        x1 = _mm_srli_epi16(v1, 4);
        w2 = _mm_srli_epi16(u2, 4);
        x2 = _mm_srli_epi16(v2, 4);
        w3 = _mm_srli_epi16(u3, 4);
        x3 = _mm_srli_epi16(v3, 4);

        // Odd bytes from the shifted copy, even bytes from the original (0x0AAB)
        u0 = _mm_blendv_epi8(u0, w0, mask0);
        v0 = _mm_blendv_epi8(v0, x0, mask0);
        u1 = _mm_blendv_epi8(u1, w1, mask0);
        v1 = _mm_blendv_epi8(v1, x1, mask0);
        u2 = _mm_blendv_epi8(u2, w2, mask0);
        v2 = _mm_blendv_epi8(v2, x2, mask0);
        u3 = _mm_blendv_epi8(u3, w3, mask0);
        v3 = _mm_blendv_epi8(v3, x3, mask0);

        // Clear all the upper nibbles (0x0A0B)
        u0 = _mm_and_si128(u0, mask1);
        v0 = _mm_and_si128(v0, mask1);
        u1 = _mm_and_si128(u1, mask1);
        v1 = _mm_and_si128(v1, mask1);
        u2 = _mm_and_si128(u2, mask1);
        v2 = _mm_and_si128(v2, mask1);
        u3 = _mm_and_si128(u3, mask1);
        v3 = _mm_and_si128(v3, mask1);

        // *0x0011 copies each low nibble into the upper nibble of its byte (0xAABB)
        u0 = _mm_mullo_epi16(u0, mul);
        v0 = _mm_mullo_epi16(v0, mul);
        u1 = _mm_mullo_epi16(u1, mul);
        v1 = _mm_mullo_epi16(v1, mul);
        u2 = _mm_mullo_epi16(u2, mul);
        v2 = _mm_mullo_epi16(v2, mul);
        u3 = _mm_mullo_epi16(u3, mul);
        v3 = _mm_mullo_epi16(v3, mul);

        // Write output
        _mm_store_si128((__m128i*)(out      ), u0);
        _mm_store_si128((__m128i*)(out +  16), v0);
        _mm_store_si128((__m128i*)(out +  32), u1);
        _mm_store_si128((__m128i*)(out +  48), v1);
        _mm_store_si128((__m128i*)(out +  64), u2);
        _mm_store_si128((__m128i*)(out +  80), v2);
        _mm_store_si128((__m128i*)(out +  96), u3);
        _mm_store_si128((__m128i*)(out + 112), v3);
        in  += 64;
        out += 128;
    }
}

// SSE2 expansion, 16 input bytes per iteration: every nibble of the input
// becomes one output byte holding that nibble twice.
//
//   in   - first input byte, 16-byte aligned (aligned loads are used)
//   past - one past the last input byte; (past - in) must be a multiple of 16
//   out  - receives 2 * (past - in) bytes, 16-byte aligned (aligned stores)
//
// An empty range (in == past) writes nothing.
void ExpandSSE2(unsigned char const *in, unsigned char const *past, unsigned char *out) {
    __m128i const mask = _mm_set1_epi16((short)0xF00F), // high nibble of the odd byte, low nibble of the even byte
                  mul0 = _mm_set1_epi16(0x0011),
                  mul1 = _mm_set1_epi16(0x1000);
    __m128i       u, v;
    while (in != past) {
        // Read 16 input bytes
        u = _mm_load_si128((__m128i const*)in);

        v = _mm_unpackhi_epi8(u, u);      // Duplicate each of the upper 8 bytes: 0xAB -> 0xABAB
        u = _mm_unpacklo_epi8(u, u);      // Same for the lower 8 bytes

        u = _mm_and_si128(u, mask);       // 0xABAB -> 0xA00B
        v = _mm_and_si128(v, mask);
        u = _mm_mullo_epi16(u, mul0);     // 0xA00B -> 0xA0BB
        v = _mm_mullo_epi16(v, mul0);
        u = _mm_mulhi_epu16(u, mul1);     // 0xA0BB -> 0x0A0B. This can also be done with a right shift of 4 bits, but this seems to measure faster
        v = _mm_mulhi_epu16(v, mul1);
        u = _mm_mullo_epi16(u, mul0);     // 0x0A0B -> 0xAABB
        v = _mm_mullo_epi16(v, mul0);

        // write output
        _mm_store_si128((__m128i*)(out     ), u);
        _mm_store_si128((__m128i*)(out + 16), v);
        in  += 16;
        out += 32;
    }
}

// SSE2 expansion, manually unrolled twice (32 input bytes per iteration). Each
// step of the ExpandSSE2 sequence is issued for both blocks before moving on.
//
//   in   - first input byte, 16-byte aligned (aligned loads are used)
//   past - one past the last input byte; (past - in) must be a multiple of 32
//   out  - receives 2 * (past - in) bytes, 16-byte aligned (aligned stores)
//
// An empty range (in == past) writes nothing.
void ExpandSSE2Unroll(unsigned char const *in, unsigned char const *past, unsigned char *out) {
    __m128i const mask = _mm_set1_epi16((short)0xF00F), // high nibble of the odd byte, low nibble of the even byte
                  mul0 = _mm_set1_epi16(0x0011),
                  mul1 = _mm_set1_epi16(0x1000);
    __m128i       u0, v0,
                  u1, v1;
    while (in != past) {
        // Read two 16-byte input blocks
        u0 = _mm_load_si128((__m128i const*)(in     ));
        u1 = _mm_load_si128((__m128i const*)(in + 16));

        // Duplicate every byte (0xAB -> 0xABAB): vN gets the upper 8 bytes of
        // block N, uN keeps the lower 8 bytes.
        v0 = _mm_unpackhi_epi8(u0, u0);
        u0 = _mm_unpacklo_epi8(u0, u0);
        v1 = _mm_unpackhi_epi8(u1, u1);
        u1 = _mm_unpacklo_epi8(u1, u1);

        // 0xABAB -> 0xA00B
        u0 = _mm_and_si128(u0, mask);
        v0 = _mm_and_si128(v0, mask);
        u1 = _mm_and_si128(u1, mask);
        v1 = _mm_and_si128(v1, mask);

        // 0xA00B -> 0xA0BB
        u0 = _mm_mullo_epi16(u0, mul0);
        v0 = _mm_mullo_epi16(v0, mul0);
        u1 = _mm_mullo_epi16(u1, mul0);
        v1 = _mm_mullo_epi16(v1, mul0);

        // 0xA0BB -> 0x0A0B (high half of a *0x1000 product == shift right by 4)
        u0 = _mm_mulhi_epu16(u0, mul1);
        v0 = _mm_mulhi_epu16(v0, mul1);
        u1 = _mm_mulhi_epu16(u1, mul1);
        v1 = _mm_mulhi_epu16(v1, mul1);

        // 0x0A0B -> 0xAABB
        u0 = _mm_mullo_epi16(u0, mul0);
        v0 = _mm_mullo_epi16(v0, mul0);
        u1 = _mm_mullo_epi16(u1, mul0);
        v1 = _mm_mullo_epi16(v1, mul0);

        // write output
        _mm_store_si128((__m128i*)(out     ), u0);
        _mm_store_si128((__m128i*)(out + 16), v0);
        _mm_store_si128((__m128i*)(out + 32), u1);
        _mm_store_si128((__m128i*)(out + 48), v1);

        in  += 32;
        out += 64;
    }
}

// SSE4.1 port of the multiply-and-mask expansion
//   ((((a & 0xF0F) * 0x101) & 0xF000F) + (((a & 0xF0F0) * 0x1010) & 0xF000F00)) * 0x11
// applied to four 16-bit groups per register, 16 input bytes per iteration.
//
//   in   - first input byte, 16-byte aligned (aligned loads are used)
//   past - one past the last input byte; (past - in) must be a multiple of 16
//   out  - receives 2 * (past - in) bytes, 16-byte aligned (aligned stores)
//
// An empty range (in == past) writes nothing.
#if defined(__GNUC__) && !defined(__SSE4_1__)
__attribute__((target("sse4.1"))) // lets GCC/Clang build this function without -msse4.1
#endif
void ExpandAShellySSE4(unsigned char const *in, unsigned char const *past, unsigned char *out) {
    __m128i const zero      = _mm_setzero_si128(),
                  v0F0F     = _mm_set1_epi32(0x0F0F),
                  vF0F0     = _mm_set1_epi32(0xF0F0),
                  v0101     = _mm_set1_epi32(0x0101),
                  v1010     = _mm_set1_epi32(0x1010),
                  v000F000F = _mm_set1_epi32(0x000F000F),
                  v0F000F00 = _mm_set1_epi32(0x0F000F00),
                  v0011     = _mm_set1_epi32(0x0011);
    __m128i       u, v, w, x;
    while (in != past) {
        // Read 16 input bytes and zero-extend each 16-bit group to 32 bits
        u = _mm_load_si128((__m128i const*)in);
        v = _mm_unpackhi_epi16(u, zero);
        u = _mm_unpacklo_epi16(u, zero);

        // original source: ((((a & 0xF0F) * 0x101) & 0xF000F) + (((a & 0xF0F0) * 0x1010) & 0xF000F00)) * 0x11;
        w = _mm_and_si128(u, v0F0F);
        x = _mm_and_si128(v, v0F0F);
        u = _mm_and_si128(u, vF0F0);
        v = _mm_and_si128(v, vF0F0);
        w = _mm_mullo_epi32(w, v0101); // _mm_mullo_epi32 is what makes this require SSE4 instead of SSE2
        x = _mm_mullo_epi32(x, v0101);
        u = _mm_mullo_epi32(u, v1010);
        v = _mm_mullo_epi32(v, v1010);
        w = _mm_and_si128(w, v000F000F);
        x = _mm_and_si128(x, v000F000F);
        u = _mm_and_si128(u, v0F000F00);
        v = _mm_and_si128(v, v0F000F00);
        u = _mm_or_si128(u, w);        // The two halves occupy disjoint bits, so OR equals the '+' of the scalar form
        v = _mm_or_si128(v, x);
        u = _mm_mullo_epi32(u, v0011);
        v = _mm_mullo_epi32(v, v0011);

        // write output
        _mm_store_si128((__m128i*)(out     ), u);
        _mm_store_si128((__m128i*)(out + 16), v);
        in  += 16;
        out += 32;
    }
}

int main() {
unsigned char *const indat   = new unsigned char[DATA_SIZE_IN ],
*const outdat0 = new unsigned char[DATA_SIZE_OUT],
*const outdat1 = new unsigned char[DATA_SIZE_OUT],
*      curout  = outdat0,
*      lastout = outdat1,
*      place;
unsigned             start,
stop;

place = indat + DATA_SIZE_IN - 1;
do {
*place = (unsigned char)rand();
} while (place-- != indat);
MakeLutLo();
MakeLutHi();
MakeLutLarge();

for (unsigned testcount = 0; testcount < 1000; ++testcount) {
// Solution posted by the asker
start = clock();
for (unsigned rerun = 0; rerun < RERUN_COUNT; ++rerun)
ExpandOrig(indat, indat + DATA_SIZE_IN, curout);
stop = clock();
std::cout << "ExpandOrig:\t\t\t" << (((stop - start) / 1000) / 60) << ':' << (((stop - start) / 1000) % 60) << ":." << ((stop - start) % 1000) << std::endl;

std::swap(curout, lastout);

// Dmitry's small lookup table solution
start = clock();
for (unsigned rerun = 0; rerun < RERUN_COUNT; ++rerun)
ExpandLookupSmall(indat, indat + DATA_SIZE_IN, curout);
stop = clock();
std::cout << "ExpandSmallLUT:\t\t\t" << (((stop - start) / 1000) / 60) << ':' << (((stop - start) / 1000) % 60) << ":." << ((stop - start) % 1000) << std::endl;

std::swap(curout, lastout);
if (memcmp(outdat0, outdat1, DATA_SIZE_OUT))
std::cout << "INCORRECT OUTPUT" << std::endl;

// Dmitry's small lookup table solution using only one lookup table
start = clock();
for (unsigned rerun = 0; rerun < RERUN_COUNT; ++rerun)
ExpandLookupSmallOneLUT(indat, indat + DATA_SIZE_IN, curout);
stop = clock();
std::cout << "ExpandLookupSmallOneLUT:\t" << (((stop - start) / 1000) / 60) << ':' << (((stop - start) / 1000) % 60) << ":." << ((stop - start) % 1000) << std::endl;

std::swap(curout, lastout);
if (memcmp(outdat0, outdat1, DATA_SIZE_OUT))
std::cout << "INCORRECT OUTPUT" << std::endl;

// Large lookup table solution
start = clock();
for (unsigned rerun = 0; rerun < RERUN_COUNT; ++rerun)
ExpandLookupLarge(indat, indat + DATA_SIZE_IN, curout);
stop = clock();
std::cout << "ExpandLookupLarge:\t\t" << (((stop - start) / 1000) / 60) << ':' << (((stop - start) / 1000) % 60) << ":." << ((stop - start) % 1000) << std::endl;

std::swap(curout, lastout);
if (memcmp(outdat0, outdat1, DATA_SIZE_OUT))
std::cout << "INCORRECT OUTPUT" << std::endl;

// AShelly's Interleave bits by Binary Magic Numbers solution
start = clock();
for (unsigned rerun = 0; rerun < RERUN_COUNT; ++rerun)
ExpandAShelly(indat, indat + DATA_SIZE_IN, curout);
stop = clock();
std::cout << "ExpandAShelly:\t\t\t" << (((stop - start) / 1000) / 60) << ':' << (((stop - start) / 1000) % 60) << ":." << ((stop - start) % 1000) << std::endl;

std::swap(curout, lastout);
if (memcmp(outdat0, outdat1, DATA_SIZE_OUT))
std::cout << "INCORRECT OUTPUT" << std::endl;

// AShelly's Interleave bits by Binary Magic Numbers solution optimizing out an addition
start = clock();
for (unsigned rerun = 0; rerun < RERUN_COUNT; ++rerun)
ExpandAShellyMulOp(indat, indat + DATA_SIZE_IN, curout);
stop = clock();
std::cout << "ExpandAShellyMulOp:\t\t" << (((stop - start) / 1000) / 60) << ':' << (((stop - start) / 1000) % 60) << ":." << ((stop - start) % 1000) << std::endl;

std::swap(curout, lastout);
if (memcmp(outdat0, outdat1, DATA_SIZE_OUT))
std::cout << "INCORRECT OUTPUT" << std::endl;

// My SSE4 solution
start = clock();
for (unsigned rerun = 0; rerun < RERUN_COUNT; ++rerun)
ExpandSSE4(indat, indat + DATA_SIZE_IN, curout);
stop = clock();
std::cout << "ExpandSSE4:\t\t\t" << (((stop - start) / 1000) / 60) << ':' << (((stop - start) / 1000) % 60) << ":." << ((stop - start) % 1000) << std::endl;

std::swap(curout, lastout);
if (memcmp(outdat0, outdat1, DATA_SIZE_OUT))
std::cout << "INCORRECT OUTPUT" << std::endl;

// My SSE4 solution unrolled
start = clock();
for (unsigned rerun = 0; rerun < RERUN_COUNT; ++rerun)
ExpandSSE4Unroll(indat, indat + DATA_SIZE_IN, curout);
stop = clock();
std::cout << "ExpandSSE4Unroll:\t\t" << (((stop - start) / 1000) / 60) << ':' << (((stop - start) / 1000) % 60) << ":." << ((stop - start) % 1000) << std::endl;

std::swap(curout, lastout);
if (memcmp(outdat0, outdat1, DATA_SIZE_OUT))
std::cout << "INCORRECT OUTPUT" << std::endl;

// My SSE2 solution
start = clock();
for (unsigned rerun = 0; rerun < RERUN_COUNT; ++rerun)
ExpandSSE2(indat, indat + DATA_SIZE_IN, curout);
stop = clock();
std::cout << "ExpandSSE2:\t\t\t" << (((stop - start) / 1000) / 60) << ':' << (((stop - start) / 1000) % 60) << ":." << ((stop - start) % 1000) << std::endl;

std::swap(curout, lastout);
if (memcmp(outdat0, outdat1, DATA_SIZE_OUT))
std::cout << "INCORRECT OUTPUT" << std::endl;

// My SSE2 solution unrolled
start = clock();
for (unsigned rerun = 0; rerun < RERUN_COUNT; ++rerun)
ExpandSSE2Unroll(indat, indat + DATA_SIZE_IN, curout);
stop = clock();
std::cout << "ExpandSSE2Unroll:\t\t" << (((stop - start) / 1000) / 60) << ':' << (((stop - start) / 1000) % 60) << ":." << ((stop - start) % 1000) << std::endl;

std::swap(curout, lastout);
if (memcmp(outdat0, outdat1, DATA_SIZE_OUT))
std::cout << "INCORRECT OUTPUT" << std::endl;

// AShelly's Interleave bits by Binary Magic Numbers solution implemented using SSE2
start = clock();
for (unsigned rerun = 0; rerun < RERUN_COUNT; ++rerun)
ExpandAShellySSE4(indat, indat + DATA_SIZE_IN, curout);
stop = clock();
std::cout << "ExpandAShellySSE4:\t\t" << (((stop - start) / 1000) / 60) << ':' << (((stop - start) / 1000) % 60) << ":." << ((stop - start) % 1000) << std::endl;

std::swap(curout, lastout);
if (memcmp(outdat0, outdat1, DATA_SIZE_OUT))
std::cout << "INCORRECT OUTPUT" << std::endl;
}

delete[] indat;
delete[] outdat0;
delete[] outdat1;
return 0;
}
``````

Unpacking a Python generator into arguments - memory efficient?

By : tihomirski
Date : March 29 2020, 07:55 AM
Python expands a generator first when applying it as arguments; all values produced by the generator are loaded into memory before the call takes place, in both options.
You could use a reduce() function call instead:
code :
``````from functools import reduce  # Python 3 forward compatibility

reduce(set.union, f(5))
``````
``````>>> def f(n):
...     for i in xrange(n):
...         yield set(xrange(i))
...
>>> reduce(set.union, f(5))
set([0, 1, 2, 3])
``````

Most efficient way of unpacking cell array with one nested level

By : Bayan Hasan
Date : March 29 2020, 07:55 AM
this one helps. Use cat(1,data{:}) which uses the comma separated list data{:} to unpack the cell and cat to concatenate.

Is `*(volatile T*)0x1234;` guaranteed to translate into read instruction?

By : Alex Martinez
Date : March 29 2020, 07:55 AM
this one helps. When working with hardware it is sometimes required to perform a read from a specific register discarding the actual value (to clear some flags, for example). One way would be to explicitly read and discard the value such as: , C 2018 6.7.3 8 says:

Elegant way for space efficient unpacking of a vector of pairs in two vectors

By : SQL_bunny
Date : March 29 2020, 07:55 AM
this one helps. I have a big vector of std::pair elements and I want to unpack it in two vectors using small extra memory overhead (I don't want the memory space occupation doubled, i.e. erasing the vector of pairs after the unpacking) and possibly as fast as possible. The following solution is unacceptably slow: , Try this: