$include_dir="/home/hyper-archives/boost/include"; include("$include_dir/msg-header.inc") ?>
Subject: [boost] [uuids] On boost::uuids::uuid operator == performance.
From: Michael Kochetkov (michael.kv_at_[hidden])
Date: 2012-04-15 19:10:04
Hello.
I would like to discuss a performance issue with boost::uuids::uuid operator
==. Let us consider the following program:
---------------------------------------------------
#include <boost/uuid/uuid.hpp>
#include <boost/uuid/uuid_generators.hpp>
int
main() {
        boost::uuids::uuid id1((boost::uuids::random_generator()())),
                id2((boost::uuids::random_generator()()));
        return id1 == id2; // we will consider this line
}
---------------------------------------------------
That is compiled with maximum optimizations the following way with Microsoft
(R) 32-bit C/C++ Optimizing Compiler Version 16.00.40219.01 for 80x86:
cl -EHsc -Ox -GL -Ob2 -Oi -Fa -I..\3dparty\boost_1_48_0 m051.cpp
boost uses the following comparison operator:
inline bool operator==(uuid const& lhs, uuid const& rhs) /* throw() */
{
    return std::equal(lhs.begin(), lhs.end(), rhs.begin());
}
which end up with the following code:
        lea	edx, DWORD PTR _id2$[esp+92]
        lea	ecx, DWORD PTR _id1$[esp+108]
        lea	eax, DWORD PTR _id1$[esp+92]
        call	?_Equal_at_std@@YA_NPBE00_at_Z		; std::_Equal
The whole story looks the following way (the code above plus std::_Equal
call expansion):
0117C6AB  lea         edx,[esp+24h]  
0117C6AF  lea         ecx,[esp+48h]  
0117C6B3  lea         eax,[esp+38h]  
0117C6B7  call        01171000  
01171000  push        esi  
01171001  mov         esi,eax  
01171003  sub         ecx,esi  
01171005  push        edi  
01171006  cmp         ecx,4  
01171009  jb          01171024  
0117100B  jmp         01171010  
0117100D  lea         ecx,[ecx]  
01171010  mov         eax,dword ptr [esi]  
01171012  cmp         eax,dword ptr [edx]  
01171014  jne         01171028  
01171016  sub         ecx,4  
01171019  add         edx,4  
0117101C  add         esi,4  
0117101F  cmp         ecx,4  
01171022  jae         01171010  
01171024  test        ecx,ecx  
01171026  je          01171073  
01171028  movzx       eax,byte ptr [esi]  
0117102B  movzx       edi,byte ptr [edx]  
0117102E  sub         eax,edi  
01171030  jne         01171063  
01171032  cmp         ecx,1  
01171035  jbe         01171073  
01171037  movzx       eax,byte ptr [esi+1]  
0117103B  movzx       edi,byte ptr [edx+1]  
0117103F  sub         eax,edi  
01171041  jne         01171063  
01171043  cmp         ecx,2  
01171046  jbe         01171073  
01171048  movzx       eax,byte ptr [esi+2]  
0117104C  movzx       edi,byte ptr [edx+2]  
01171050  sub         eax,edi  
01171052  jne         01171063  
01171054  cmp         ecx,3  
01171057  jbe         01171073  
01171059  movzx       eax,byte ptr [esi+3]  
0117105D  movzx       ecx,byte ptr [edx+3]  
01171061  sub         eax,ecx  
01171063  sar         eax,1Fh  
01171066  or          eax,1  
01171069  xor         edx,edx  
0117106B  test        eax,eax  
0117106D  pop         edi  
0117106E  sete        al  
01171071  pop         esi  
01171072  ret  
01171073  xor         eax,eax  
01171075  xor         edx,edx  
01171077  test        eax,eax  
01171079  pop         edi  
0117107A  sete        al  
0117107D  pop         esi  
0117107E  ret  
I do understand that the boost::uuids::uuid implementation is conceptually
correct and inoculate the best programming practices. But the following
equivalent is too much faster (you may probably want to ensure a proper
alignment for some processor architectures or even get even faster 64-bit
version):
inline
bool comp(const boost::uuids::uuid& l, const boost::uuids::uuid& r) {
        return *reinterpret_cast<const uint32_t*>(l.data) ==
*reinterpret_cast<const uint32_t*>(r.data) 
                && *reinterpret_cast<const uint32_t*>(l.data+4) ==
*reinterpret_cast<const uint32_t*>(r.data+4)
                && *reinterpret_cast<const uint32_t*>(l.data+8) ==
*reinterpret_cast<const uint32_t*>(r.data+8)
                && *reinterpret_cast<const uint32_t*>(l.data+12) ==
*reinterpret_cast<const uint32_t*>(r.data+12);
}
the assembler output is:
        mov	ecx, DWORD PTR _id1$[esp+92]
        cmp	ecx, DWORD PTR _id2$[esp+92]
        jne	SHORT $LN37_at_main
        mov	edx, DWORD PTR _id1$[esp+96]
        cmp	edx, DWORD PTR _id2$[esp+96]
        jne	SHORT $LN37_at_main
        mov	eax, DWORD PTR _id1$[esp+100]
        cmp	eax, DWORD PTR _id2$[esp+100]
        jne	SHORT $LN37_at_main
        mov	ecx, DWORD PTR _id1$[esp+104]
        cmp	ecx, DWORD PTR _id2$[esp+104]
        jne	SHORT $LN37_at_main
        mov	eax, 1
        jmp	SHORT $LN40_at_main
$LN37_at_main:
        xor	eax, eax
$LN40_at_main:
        movzx	eax, al
that the question comes: is boost supposed to be used in commercial
applications or it is just a testing area for some concepts?
You have probably never thought of the fact that Debug (unoptimized
configurations) are sometimes just unusable with such approach on real (big)
data.
If you are interested you may want to compile the program above this way and
see the resulting code:
cl -EHsc -Zi -Fa -I..\3dparty\boost_1_48_0 m051.cpp
Thank you in advance for your considerations.
-- Michael Kochetkov