<?php $include_dir="/home/hyper-archives/boost-commit/include"; include("$include_dir/msg-header.inc") ?>
Subject: [Boost-commit] svn:boost r76302 - in branches/release/libs/locale: . src src/encoding test
From: artyomtnk_at_[hidden]
Date: 2012-01-04 05:58:36
Author: artyom
Date: 2012-01-04 05:58:35 EST (Wed, 04 Jan 2012)
New Revision: 76302
URL: http://svn.boost.org/trac/boost/changeset/76302
Log:
Merged changesets 75594 and 75601 from trunk: incorrect use of
MultiByteToWideChar in detection of invalid sequences.
Properties modified: 
   branches/release/libs/locale/   (props changed)
   branches/release/libs/locale/src/   (props changed)
Text files modified: 
   branches/release/libs/locale/src/encoding/wconv_codepage.ipp |    87 ++++++++++++++++++++++++++++++--------- 
   branches/release/libs/locale/test/test_codepage.cpp          |    47 +++++++++++++++++++++                   
   2 files changed, 113 insertions(+), 21 deletions(-)
Modified: branches/release/libs/locale/src/encoding/wconv_codepage.ipp
==============================================================================
--- branches/release/libs/locale/src/encoding/wconv_codepage.ipp	(original)
+++ branches/release/libs/locale/src/encoding/wconv_codepage.ipp	2012-01-04 05:58:35 EST (Wed, 04 Jan 2012)
@@ -86,12 +86,6 @@
         { "windows932",         932, 0 },
     };
 
-    size_t remove_substitutions(std::vector<wchar_t> &v)
-    {
-        v.erase(std::remove(v.begin(), v.end(), wchar_t(0xFFFD)), v.end());
-        return v.size();
-    }
-
     size_t remove_substitutions(std::vector<char> &v)
     {
         if(std::find(v.begin(),v.end(),0) == v.end()) {
@@ -107,23 +101,39 @@
         return v.size();
     }
 
+    void multibyte_to_wide_one_by_one(int codepage,char const *begin,char const *end,std::vector<wchar_t> &buf)
+    {
+        buf.reserve(end-begin);
+        while(begin!=end) {
+            wchar_t wide_buf[4];
+            int n = 0;
+            int len = IsDBCSLeadByteEx(codepage,*begin) ? 2 : 1;
+            if(len == 2 && begin+1==end)
+                return;
+            n = MultiByteToWideChar(codepage,MB_ERR_INVALID_CHARS,begin,len,wide_buf,4);
+            for(int i=0;i<n;i++) 
+                buf.push_back(wide_buf[i]);
+            begin+=len;
+        }
+    }
+
     
     void multibyte_to_wide(int codepage,char const *begin,char const *end,bool do_skip,std::vector<wchar_t> &buf)
     {
         if(begin==end)
             return;
-        DWORD flags = do_skip ? 0 : MB_ERR_INVALID_CHARS;
-        if(50220 <= codepage && codepage <= 50229)
-            flags = 0;
-        
-        int n = MultiByteToWideChar(codepage,flags,begin,end-begin,0,0);
-        if(n == 0)
+        int n = MultiByteToWideChar(codepage,MB_ERR_INVALID_CHARS,begin,end-begin,0,0);
+        if(n == 0) {
+            if(do_skip) {
+                multibyte_to_wide_one_by_one(codepage,begin,end,buf);
+                return;
+            }
             throw conversion_error();
+        }
+
         buf.resize(n,0);
-        if(MultiByteToWideChar(codepage,flags,begin,end-begin,&buf.front(),buf.size())==0)
+        if(MultiByteToWideChar(codepage,MB_ERR_INVALID_CHARS,begin,end-begin,&buf.front(),buf.size())==0)
             throw conversion_error();
-        if(do_skip)
-            remove_substitutions(buf);
     }
 
     void wide_to_multibyte_non_zero(int codepage,wchar_t const *begin,wchar_t const *end,bool do_skip,std::vector<char> &buf)
@@ -256,13 +266,37 @@
         }
         virtual std::string convert(char const *begin,char const *end)
         {
+            if(to_code_page_ == 65001 && from_code_page_ == 65001)
+                return utf_to_utf<char>(begin,end,how_);
+
             std::string res;
-            std::vector<wchar_t> tmp;
-            multibyte_to_wide(from_code_page_,begin,end,how_ == skip,tmp);
-            if(tmp.empty())
-                return res;
+            
+            std::vector<wchar_t> tmp;   // buffer for mb2w
+            std::wstring tmps;          // buffer for utf_to_utf
+            wchar_t const *wbegin=0;
+            wchar_t const *wend=0;
+            
+            if(from_code_page_ == 65001) {
+                tmps = utf_to_utf<wchar_t>(begin,end,how_);
+                if(tmps.empty())
+                    return res;
+                wbegin = tmps.c_str();
+                wend = wbegin + tmps.size();
+            }
+            else {
+                multibyte_to_wide(from_code_page_,begin,end,how_ == skip,tmp);
+                if(tmp.empty())
+                    return res;
+                wbegin = &tmp[0];
+                wend = wbegin + tmp.size();
+            }
+            
+            if(to_code_page_ == 65001) {
+                return utf_to_utf<char>(wbegin,wend,how_);
+            }
+
             std::vector<char> ctmp;
-            wide_to_multibyte(to_code_page_,&tmp.front(),&tmp.front()+tmp.size(),how_ == skip,ctmp);
+            wide_to_multibyte(to_code_page_,wbegin,wend,how_ == skip,ctmp);
             if(ctmp.empty())
                 return res;
             res.assign(&ctmp.front(),ctmp.size());
@@ -328,6 +362,9 @@
 
         virtual string_type convert(char const *begin,char const *end) 
         {
+            if(code_page_ == 65001) {
+                return utf_to_utf<char_type>(begin,end,how_);
+            }
             std::vector<wchar_t> tmp;
             multibyte_to_wide(code_page_,begin,end,how_ == skip,tmp);
             string_type res;
@@ -363,6 +400,9 @@
 
         virtual std::string convert(CharType const *begin,CharType const *end) 
         {
+            if(code_page_ == 65001) {
+                return utf_to_utf<char>(begin,end,how_);
+            }
             wchar_t const *wbegin = 0;
             wchar_t const *wend = 0;
             std::vector<wchar_t> buffer; // if needed
@@ -424,9 +464,11 @@
 
         virtual string_type convert(char const *begin,char const *end) 
         {
+            if(code_page_ == 65001) {
+                return utf_to_utf<char_type>(begin,end,how_);
+            }
             std::vector<wchar_t> buf;
             multibyte_to_wide(code_page_,begin,end,how_ == skip,buf);
-            remove_substitutions(buf);
 
             if(buf.empty())
                 return string_type();
@@ -460,6 +502,9 @@
 
         virtual std::string convert(CharType const *begin,CharType const *end) 
         {
+            if(code_page_ == 65001) {
+                return utf_to_utf<char>(begin,end,how_);
+            }
             std::wstring tmp = utf_to_utf<wchar_t>(begin,end,how_);
 
             std::vector<char> ctmp;
Modified: branches/release/libs/locale/test/test_codepage.cpp
==============================================================================
--- branches/release/libs/locale/test/test_codepage.cpp	(original)
+++ branches/release/libs/locale/test/test_codepage.cpp	2012-01-04 05:58:35 EST (Wed, 04 Jan 2012)
@@ -333,6 +333,51 @@
 }
 
 
+void test_skip(char const *enc,char const *utf,char const *name,char const *opt=0)
+{
+    if(opt!=0) {
+        if(boost::locale::conv::to_utf<char>(enc,name) == opt) {
+            test_skip(enc,opt,name);
+            return;
+        }
+    }
+    TEST(boost::locale::conv::to_utf<char>(enc,name) == utf);
+    TEST(boost::locale::conv::to_utf<wchar_t>(enc,name) == boost::locale::conv::utf_to_utf<wchar_t>(utf));
+    #ifdef BOOST_HAS_CHAR16_T
+    TEST(boost::locale::conv::to_utf<char16_t>(enc,name) == boost::locale::conv::utf_to_utf<char16_t>(utf));
+    #endif
+    #ifdef BOOST_HAS_CHAR32_T
+    TEST(boost::locale::conv::to_utf<char32_t>(enc,name) == boost::locale::conv::utf_to_utf<char32_t>(utf));
+    #endif
+}
+
+void test_simple_conversions()
+{
+    namespace blc=boost::locale::conv;
+    std::cout << "- Testing correct invalid bytes skipping" << std::endl;
+    try {
+        std::cout << "-- ISO-8859-8" << std::endl;
+        test_skip("test \xE0\xE1\xFB-","test \xd7\x90\xd7\x91-","ISO-8859-8");
+        test_skip("\xFB","","ISO-8859-8");
+        test_skip("test \xE0\xE1\xFB","test \xd7\x90\xd7\x91","ISO-8859-8");
+        test_skip("\xFB-","-","ISO-8859-8");
+    }
+    catch(blc::invalid_charset_error const &) {
+        std::cout <<"--- not supported" << std::endl;
+    }
+    try {
+        std::cout << "-- cp932" << std::endl;
+        test_skip("test\xE0\xA0 \x83\xF8-","test\xe7\x87\xbf -","cp932","test\xe7\x87\xbf ");
+        test_skip("\x83\xF8","","cp932");
+        test_skip("test\xE0\xA0 \x83\xF8","test\xe7\x87\xbf ","cp932");
+        test_skip("\x83\xF8-","-","cp932","");
+    }
+    catch(blc::invalid_charset_error const &) {
+        std::cout <<"--- not supported" << std::endl;
+    }
+}
+
+
 int main()
 {
     try {
@@ -353,6 +398,8 @@
         #if !defined(BOOST_LOCALE_WITH_ICU) && !defined(BOOST_LOCALE_WITH_ICONV) && (defined(BOOST_WINDOWS) || defined(__CYGWIN__))
         test_iso_8859_8 = IsValidCodePage(28598)!=0;
         #endif
+
+        test_simple_conversions();
         
         
         for(int type = 0; type < int(def.size()); type ++ ) {