$include_dir="/home/hyper-archives/boost-commit/include"; include("$include_dir/msg-header.inc") ?>
Subject: [Boost-commit] svn:boost r84204 - in trunk/libs/log: build src test/run
From: andrey.semashev_at_[hidden]
Date: 2013-05-09 12:53:03
Author: andysem
Date: 2013-05-09 12:53:02 EDT (Thu, 09 May 2013)
New Revision: 84204
URL: http://svn.boost.org/trac/boost/changeset/84204
Log:
Implemented AVX2 version of dump.
Text files modified: 
   trunk/libs/log/build/Jamfile.v2             |     4                                         
   trunk/libs/log/src/dump_avx2.cpp            |   174 +++++++++++++++++++++++++-------------- 
   trunk/libs/log/test/run/util_manip_dump.cpp |    25 +++++                                   
   3 files changed, 136 insertions(+), 67 deletions(-)
Modified: trunk/libs/log/build/Jamfile.v2
==============================================================================
--- trunk/libs/log/build/Jamfile.v2	(original)
+++ trunk/libs/log/build/Jamfile.v2	2013-05-09 12:53:02 EDT (Thu, 09 May 2013)
@@ -286,7 +286,7 @@
 
             if ! [ version.version-less $(version) : 4 7 ]
             {
-                result = <cxxflags>"-march=core-avx2 -mavx -mavx2" ;
+                result = <cxxflags>"-march=core-avx2 -mavx -mavx2 -fabi-version=0" ;
             }
         }
         else if <toolset>msvc in $(properties)
@@ -325,7 +325,7 @@
                 }
                 else
                 {
-                    result = <cxxflags>"-march=core-avx2 -mavx -mavx2" ;
+                    result = <cxxflags>"-march=core-avx2 -mavx -mavx2 -fabi-version=0" ;
                 }
             }
         }
Modified: trunk/libs/log/src/dump_avx2.cpp
==============================================================================
--- trunk/libs/log/src/dump_avx2.cpp	(original)
+++ trunk/libs/log/src/dump_avx2.cpp	2013-05-09 12:53:02 EDT (Thu, 09 May 2013)
@@ -40,87 +40,132 @@
 enum
 {
     packs_per_stride = 32,
-    stride = packs_per_stride * 16
+    stride = packs_per_stride * 32
 };
 
-union xmm_constant
+union ymm_constant
 {
-    uint8_t as_bytes[16];
-    __m128i as_mm;
+    uint8_t as_bytes[32];
+    __m256i as_mm;
 };
 
-static const xmm_constant mm_15 = {{ 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F }};
-static const xmm_constant mm_9 = {{ 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9 }};
-static const xmm_constant mm_char_0 = {{ '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0' }};
-static const xmm_constant mm_char_space_mask = {{ ' ', 0, 0, ' ', 0, 0, ' ', 0, 0, ' ', 0, 0, ' ', 0, 0, ' ' }};
-static const xmm_constant mm_shuffle_pattern1 = {{ 0x80, 0, 1, 0x80, 2, 3, 0x80, 4, 5, 0x80, 6, 7, 0x80, 8, 9, 0x80 }};
-static const xmm_constant mm_shuffle_pattern2 = {{ 0, 1, 0x80, 2, 3, 0x80, 4, 5, 0x80, 6, 7, 0x80, 8, 9, 0x80, 10 }};
-static const xmm_constant mm_shuffle_pattern3 = {{ 5, 0x80, 6, 7, 0x80, 8, 9, 0x80, 10, 11, 0x80, 12, 13, 0x80, 14, 15 }};
+static const ymm_constant mm_char_space_mask =  {{ ' ', 0, 0, ' ', 0, 0, ' ', 0, 0, ' ', 0, 0, ' ', 0, 0, ' ',             ' ', 0, 0, ' ', 0, 0, ' ', 0, 0, ' ', 0, 0, ' ', 0, 0, ' ' }};
+static const ymm_constant mm_shuffle_pattern1 = {{ 0x80, 0, 1, 0x80, 2, 3, 0x80, 4, 5, 0x80, 6, 7, 0x80, 8, 9, 0x80,       0x80, 0, 1, 0x80, 2, 3, 0x80, 4, 5, 0x80, 6, 7, 0x80, 8, 9, 0x80 }};
+static const ymm_constant mm_shuffle_pattern2 = {{ 0, 1, 0x80, 2, 3, 0x80, 4, 5, 0x80, 6, 7, 0x80, 8, 9, 0x80, 10,         0, 1, 0x80, 2, 3, 0x80, 4, 5, 0x80, 6, 7, 0x80, 8, 9, 0x80, 10 }};
+static const ymm_constant mm_shuffle_pattern3 = {{ 5, 0x80, 6, 7, 0x80, 8, 9, 0x80, 10, 11, 0x80, 12, 13, 0x80, 14, 15,    5, 0x80, 6, 7, 0x80, 8, 9, 0x80, 10, 11, 0x80, 12, 13, 0x80, 14, 15 }};
+static const ymm_constant mm_shuffle_pattern13 = {{ 0x80, 0, 1, 0x80, 2, 3, 0x80, 4, 5, 0x80, 6, 7, 0x80, 8, 9, 0x80,      5, 0x80, 6, 7, 0x80, 8, 9, 0x80, 10, 11, 0x80, 12, 13, 0x80, 14, 15 }};
 
 //! Dumps a pack of input data into a string of 8 bit ASCII characters
-static BOOST_LOG_FORCEINLINE void dump_pack(__m128i mm_char_10_to_a, __m128i mm_input, __m128i& mm_output1, __m128i& mm_output2, __m128i& mm_output3)
+static BOOST_LOG_FORCEINLINE void dump_pack(__m256i mm_char_a, __m256i mm_input, __m256i& mm_output1, __m256i& mm_output2, __m256i& mm_output3)
 {
     // Split half-bytes
-    __m128i mm_input_hi = _mm_and_si128(_mm_srli_epi16(mm_input, 4), mm_15.as_mm);
-    __m128i mm_input_lo = _mm_and_si128(mm_input, mm_15.as_mm);
+    const __m256i mm_15 = _mm256_set1_epi8(0x0F);
+    __m256i mm_input_hi = _mm256_and_si256(_mm256_srli_epi16(mm_input, 4), mm_15);
+    __m256i mm_input_lo = _mm256_and_si256(mm_input, mm_15);
 
     // Stringize each of the halves
-    __m128i mm_addend_hi = _mm_cmpgt_epi8(mm_input_hi, mm_9.as_mm);
-    __m128i mm_addend_lo = _mm_cmpgt_epi8(mm_input_lo, mm_9.as_mm);
-    mm_addend_hi = _mm_and_si128(mm_char_10_to_a, mm_addend_hi);
-    mm_addend_lo = _mm_and_si128(mm_char_10_to_a, mm_addend_lo);
+    const __m256i mm_9 = _mm256_set1_epi8(9);
+    __m256i mm_addend_hi = _mm256_cmpgt_epi8(mm_input_hi, mm_9);
+    __m256i mm_addend_lo = _mm256_cmpgt_epi8(mm_input_lo, mm_9);
+    const __m256i mm_char_0 = _mm256_set1_epi8('0');
+    mm_addend_hi = _mm256_blendv_epi8(mm_char_0, mm_char_a, mm_addend_hi);
+    mm_addend_lo = _mm256_blendv_epi8(mm_char_0, mm_char_a, mm_addend_lo);
 
-    mm_input_hi = _mm_add_epi8(mm_input_hi, mm_char_0.as_mm);
-    mm_input_lo = _mm_add_epi8(mm_input_lo, mm_char_0.as_mm);
-
-    mm_input_hi = _mm_add_epi8(mm_input_hi, mm_addend_hi);
-    mm_input_lo = _mm_add_epi8(mm_input_lo, mm_addend_lo);
+    mm_input_hi = _mm256_add_epi8(mm_input_hi, mm_addend_hi);
+    mm_input_lo = _mm256_add_epi8(mm_input_lo, mm_addend_lo);
 
     // Join them back together
-    __m128i mm_1 = _mm_unpacklo_epi8(mm_input_hi, mm_input_lo);
-    __m128i mm_2 = _mm_unpackhi_epi8(mm_input_hi, mm_input_lo);
+    __m256i mm_1 = _mm256_unpacklo_epi8(mm_input_hi, mm_input_lo);
+    __m256i mm_2 = _mm256_unpackhi_epi8(mm_input_hi, mm_input_lo);
 
     // Insert spaces between stringized bytes:
     // |0123456789abcdef|0123456789abcdef|
     // | 01 23 45 67 89 |ab cd ef 01 23 4|5 67 89 ab cd ef|
-    mm_output1 = _mm_shuffle_epi8(mm_1, mm_shuffle_pattern1.as_mm);
-    mm_output2 = _mm_shuffle_epi8(_mm_alignr_epi8(mm_2, mm_1, 10), mm_shuffle_pattern2.as_mm);
-    mm_output3 = _mm_shuffle_epi8(mm_2, mm_shuffle_pattern3.as_mm);
+    __m256i mm_out1 = _mm256_shuffle_epi8(mm_1, mm_shuffle_pattern1.as_mm);
+    __m256i mm_out2 = _mm256_shuffle_epi8(_mm256_alignr_epi8(mm_2, mm_1, 10), mm_shuffle_pattern2.as_mm);
+    __m256i mm_out3 = _mm256_shuffle_epi8(mm_2, mm_shuffle_pattern3.as_mm);
+
+    __m256i mm_char_space = mm_char_space_mask.as_mm;
+    mm_out1 = _mm256_or_si256(mm_out1, mm_char_space);
+    mm_char_space = _mm256_srli_si256(mm_char_space, 1);
+    mm_out2 = _mm256_or_si256(mm_out2, mm_char_space);
+    mm_char_space = _mm256_srli_si256(mm_char_space, 1);
+    mm_out3 = _mm256_or_si256(mm_out3, mm_char_space);
+
+    mm_output1 = _mm256_permute2x128_si256(mm_out1, mm_out2, 2u << 4 | 0u);
+    mm_output2 = _mm256_permute2x128_si256(mm_out3, mm_out1, 3u << 4 | 0u);
+    mm_output3 = _mm256_permute2x128_si256(mm_out2, mm_out3, 3u << 4 | 1u);
+}
+
+//! Dumps a pack of input data into a string of 8 bit ASCII characters
+static BOOST_LOG_FORCEINLINE void dump_pack(__m256i mm_char_a, __m128i mm_input, __m128i& mm_output1, __m128i& mm_output2, __m128i& mm_output3)
+{
+    // Split half-bytes
+    __m128i mm_input_hi = _mm_srli_epi16(mm_input, 4);
+    __m256i mm = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_unpacklo_epi8(mm_input_hi, mm_input)), _mm_unpackhi_epi8(mm_input_hi, mm_input), 1);
+    mm = _mm256_and_si256(mm, _mm256_set1_epi8(0x0F));
+
+    // Stringize the halves
+    __m256i mm_addend = _mm256_cmpgt_epi8(mm, _mm256_set1_epi8(9));
+    mm_addend = _mm256_blendv_epi8(_mm256_set1_epi8('0'), mm_char_a, mm_addend);
+    mm = _mm256_add_epi8(mm, mm_addend);
+
+    // Insert spaces between stringized bytes:
+    __m256i mm_out13 = _mm256_shuffle_epi8(mm, mm_shuffle_pattern13.as_mm);
+    __m128i mm_out2 = _mm_shuffle_epi8(_mm_alignr_epi8(_mm256_extractf128_si256(mm, 1), _mm256_castsi256_si128(mm), 10), _mm256_castsi256_si128(mm_shuffle_pattern2.as_mm));
 
-    __m128i mm_char_space = mm_char_space_mask.as_mm;
-    mm_output1 = _mm_or_si128(mm_output1, mm_char_space);
+    __m128i mm_char_space = _mm256_castsi256_si128(mm_char_space_mask.as_mm);
+    mm_output1 = _mm_or_si128(_mm256_castsi256_si128(mm_out13), mm_char_space);
     mm_char_space = _mm_srli_si128(mm_char_space, 1);
-    mm_output2 = _mm_or_si128(mm_output2, mm_char_space);
+    mm_output2 = _mm_or_si128(mm_out2, mm_char_space);
     mm_char_space = _mm_srli_si128(mm_char_space, 1);
-    mm_output3 = _mm_or_si128(mm_output3, mm_char_space);
+    mm_output3 = _mm_or_si128(_mm256_extractf128_si256(mm_out13, 1), mm_char_space);
 }
 
 template< typename CharT >
-BOOST_LOG_FORCEINLINE void store_characters(__m128i mm_chars, CharT* buf)
+BOOST_LOG_FORCEINLINE void store_characters(__m256i mm_chars, CharT* buf)
 {
     switch (sizeof(CharT))
     {
     case 1:
-        _mm_store_si128(reinterpret_cast< __m128i* >(buf), mm_chars);
+        _mm256_store_si256(reinterpret_cast< __m256i* >(buf), mm_chars);
         break;
 
     case 2:
+        _mm256_store_si256(reinterpret_cast< __m256i* >(buf), _mm256_cvtepu8_epi16(_mm256_castsi256_si128(mm_chars)));
+        _mm256_store_si256(reinterpret_cast< __m256i* >(buf) + 1, _mm256_cvtepu8_epi16(_mm256_extractf128_si256(mm_chars, 1)));
+        break;
+
+    case 4:
         {
-            __m128i mm_0 = _mm_setzero_si128();
-            _mm_store_si128(reinterpret_cast< __m128i* >(buf), _mm_unpacklo_epi8(mm_chars, mm_0));
-            _mm_store_si128(reinterpret_cast< __m128i* >(buf) + 1, _mm_unpackhi_epi8(mm_chars, mm_0));
+            __m256i mm = _mm256_unpackhi_epi64(mm_chars, mm_chars);
+            _mm256_store_si256(reinterpret_cast< __m256i* >(buf), _mm256_cvtepu8_epi32(_mm256_castsi256_si128(mm_chars)));
+            _mm256_store_si256(reinterpret_cast< __m256i* >(buf) + 1, _mm256_cvtepu8_epi32(_mm256_castsi256_si128(mm)));
+            _mm256_store_si256(reinterpret_cast< __m256i* >(buf) + 2, _mm256_cvtepu8_epi32(_mm256_extractf128_si256(mm_chars, 1)));
+            _mm256_store_si256(reinterpret_cast< __m256i* >(buf) + 3, _mm256_cvtepu8_epi32(_mm256_extractf128_si256(mm, 1)));
         }
         break;
+    }
+}
+
+template< typename CharT >
+BOOST_LOG_FORCEINLINE void store_characters(__m128i mm_chars, CharT* buf)
+{
+    switch (sizeof(CharT))
+    {
+    case 1:
+        _mm_store_si128(reinterpret_cast< __m128i* >(buf), mm_chars);
+        break;
+
+    case 2:
+        _mm256_store_si256(reinterpret_cast< __m256i* >(buf), _mm256_cvtepu8_epi16(mm_chars));
+        break;
 
     case 4:
         {
-            __m128i mm_0 = _mm_setzero_si128();
-            __m128i mm = _mm_unpacklo_epi8(mm_chars, mm_0);
-            _mm_store_si128(reinterpret_cast< __m128i* >(buf), _mm_unpacklo_epi16(mm, mm_0));
-            _mm_store_si128(reinterpret_cast< __m128i* >(buf) + 1, _mm_unpackhi_epi16(mm, mm_0));
-            mm = _mm_unpackhi_epi8(mm_chars, mm_0);
-            _mm_store_si128(reinterpret_cast< __m128i* >(buf) + 2, _mm_unpacklo_epi16(mm, mm_0));
-            _mm_store_si128(reinterpret_cast< __m128i* >(buf) + 3, _mm_unpackhi_epi16(mm, mm_0));
+            __m128i mm = _mm_unpackhi_epi64(mm_chars, mm_chars);
+            _mm256_store_si256(reinterpret_cast< __m256i* >(buf), _mm256_cvtepu8_epi32(mm_chars));
+            _mm256_store_si256(reinterpret_cast< __m256i* >(buf) + 1, _mm256_cvtepu8_epi32(mm));
         }
         break;
     }
@@ -131,29 +176,26 @@
 {
     typedef CharT char_type;
 
-    char_type buf_storage[stride * 3u + 16u];
-    // Align the temporary buffer at 16 bytes
-    char_type* const buf = reinterpret_cast< char_type* >((uint8_t*)buf_storage + (16u - (((uintptr_t)(char_type*)buf_storage) & 15u)));
+    char_type buf_storage[stride * 3u + 32u];
+    // Align the temporary buffer at 32 bytes
+    char_type* const buf = reinterpret_cast< char_type* >((uint8_t*)buf_storage + (32u - (((uintptr_t)(char_type*)buf_storage) & 31u)));
     char_type* buf_begin = buf + 1u; // skip the first space of the first chunk
     char_type* buf_end = buf + stride * 3u;
 
-    __m128i mm_char_10_to_a;
-    if (strm.flags() & std::ios_base::uppercase)
-        mm_char_10_to_a = _mm_set1_epi32(0x07070707); // '9' is 0x39 and 'A' is 0x41 in ASCII, so we have to add 0x07 to 0x3A to get uppercase letters
-    else
-        mm_char_10_to_a = _mm_set1_epi32(0x27272727); // ...and 'a' is 0x61, which means we have to add 0x27 to 0x3A to get lowercase letters
+    const __m256i mm_char_a = _mm256_set1_epi8(((strm.flags() & std::ios_base::uppercase) ? 'A' : 'a') - 10);
 
     // First, check the input alignment
     const uint8_t* p = static_cast< const uint8_t* >(data);
-    if (const std::size_t prealign_size = ((16u - ((uintptr_t)p & 15u)) & 15u))
+    if (const std::size_t prealign_size = ((32u - ((uintptr_t)p & 31u)) & 31u))
     {
-        __m128i mm_input = _mm_lddqu_si128(reinterpret_cast< const __m128i* >(p));
-        __m128i mm_output1, mm_output2, mm_output3;
-        dump_pack(mm_char_10_to_a, mm_input, mm_output1, mm_output2, mm_output3);
+        __m256i mm_input = _mm256_lddqu_si256(reinterpret_cast< const __m256i* >(p));
+        __m256i mm_output1, mm_output2, mm_output3;
+        dump_pack(mm_char_a, mm_input, mm_output1, mm_output2, mm_output3);
         store_characters(mm_output1, buf);
-        store_characters(mm_output2, buf + 16u);
-        store_characters(mm_output3, buf + 32u);
+        store_characters(mm_output2, buf + 32u);
+        store_characters(mm_output3, buf + 64u);
 
+        _mm256_zeroupper();
         strm.write(buf_begin, prealign_size * 3u - 1u);
         buf_begin = buf;
         size -= prealign_size;
@@ -165,16 +207,17 @@
     for (std::size_t i = 0; i < stride_count; ++i)
     {
         char_type* b = buf;
-        for (unsigned int j = 0; j < packs_per_stride; ++j, b += 3u * 16u, p += 16u)
+        for (unsigned int j = 0; j < packs_per_stride; ++j, b += 3u * 32u, p += 32u)
         {
-            __m128i mm_input = _mm_load_si128(reinterpret_cast< const __m128i* >(p));
-            __m128i mm_output1, mm_output2, mm_output3;
-            dump_pack(mm_char_10_to_a, mm_input, mm_output1, mm_output2, mm_output3);
+            __m256i mm_input = _mm256_load_si256(reinterpret_cast< const __m256i* >(p));
+            __m256i mm_output1, mm_output2, mm_output3;
+            dump_pack(mm_char_a, mm_input, mm_output1, mm_output2, mm_output3);
             store_characters(mm_output1, b);
-            store_characters(mm_output2, b + 16u);
-            store_characters(mm_output3, b + 32u);
+            store_characters(mm_output2, b + 32u);
+            store_characters(mm_output3, b + 64u);
         }
 
+        _mm256_zeroupper();
         strm.write(buf_begin, buf_end - buf_begin);
         buf_begin = buf;
     }
@@ -186,7 +229,7 @@
         {
             __m128i mm_input = _mm_load_si128(reinterpret_cast< const __m128i* >(p));
             __m128i mm_output1, mm_output2, mm_output3;
-            dump_pack(mm_char_10_to_a, mm_input, mm_output1, mm_output2, mm_output3);
+            dump_pack(mm_char_a, mm_input, mm_output1, mm_output2, mm_output3);
             store_characters(mm_output1, b);
             store_characters(mm_output2, b + 16u);
             store_characters(mm_output3, b + 32u);
@@ -204,6 +247,7 @@
             b[2] = static_cast< char_type >(char_table[n & 0x0F]);
         }
 
+        _mm256_zeroupper();
         strm.write(buf_begin, b - buf_begin);
     }
 }
Modified: trunk/libs/log/test/run/util_manip_dump.cpp
==============================================================================
--- trunk/libs/log/test/run/util_manip_dump.cpp	(original)
+++ trunk/libs/log/test/run/util_manip_dump.cpp	2013-05-09 12:53:02 EDT (Thu, 09 May 2013)
@@ -130,6 +130,31 @@
     BOOST_CHECK(equal_strings(strm_dump.str(), strm_correct.str()));
 }
 
+// Test SIMD tail handling
+BOOST_AUTO_TEST_CASE_TEMPLATE(unbounded_binary_tail_dump, CharT, char_types)
+{
+    typedef CharT char_type;
+    typedef std::basic_string< char_type > string_type;
+    typedef std::basic_ostringstream< char_type > ostream_type;
+
+    std::vector< unsigned char > data;
+    ostream_type strm_correct;
+    // 1023 makes it very unlikely for the buffer to end at 16 or 32 byte boundary, which makes the dump algorithm to process the tail in a special way
+    for (unsigned int i = 0; i < 1023; ++i)
+    {
+        unsigned char n = static_cast< unsigned char >(i);
+        data.push_back(n);
+        if (i > 0)
+            strm_correct << " ";
+        strm_correct << std::hex << std::setw(2) << std::setfill(static_cast< char_type >('0')) << static_cast< unsigned int >(n);
+    }
+
+    ostream_type strm_dump;
+    strm_dump << logging::dump(&data[0], data.size());
+
+    BOOST_CHECK(equal_strings(strm_dump.str(), strm_correct.str()));
+}
+
 // Test bounded dump
 BOOST_AUTO_TEST_CASE_TEMPLATE(bounded_binary_dump, CharT, char_types)
 {