<?php $include_dir="/home/hyper-archives/boost-commit/include"; include("$include_dir/msg-header.inc") ?>
Subject: [Boost-commit] svn:boost r64688 - in sandbox/SOC/2009/unicode: boost/iterator boost/unicode libs/unicode/doc libs/unicode/example libs/unicode/test/iterator libs/unicode/test/unicode
From: loufoque_at_[hidden]
Date: 2010-08-08 20:58:53
Author: mgaunard
Date: 2010-08-08 20:58:42 EDT (Sun, 08 Aug 2010)
New Revision: 64688
URL: http://svn.boost.org/trac/boost/changeset/64688
Log:
base64 example and make codecvt actually use boundary checkers
Added:
   sandbox/SOC/2009/unicode/libs/unicode/example/base64.cpp   (contents, props changed)
Text files modified: 
   sandbox/SOC/2009/unicode/boost/iterator/codecvt_converter.hpp        |    11 +                                       
   sandbox/SOC/2009/unicode/boost/iterator/converter_codecvt.hpp        |   256 +++++++++++++++++++++++++++------------ 
   sandbox/SOC/2009/unicode/boost/unicode/utf_codecs.hpp                |    29 ++++                                    
   sandbox/SOC/2009/unicode/libs/unicode/doc/autodoc1c.xml              |     2                                         
   sandbox/SOC/2009/unicode/libs/unicode/doc/users_manual.qbk           |     7                                         
   sandbox/SOC/2009/unicode/libs/unicode/example/Jamfile.v2             |     1                                         
   sandbox/SOC/2009/unicode/libs/unicode/test/iterator/test_codecvt.cpp |    49 ++++++-                                 
   sandbox/SOC/2009/unicode/libs/unicode/test/unicode/test_locale.cpp   |    23 ++-                                     
   8 files changed, 272 insertions(+), 106 deletions(-)
Modified: sandbox/SOC/2009/unicode/boost/iterator/codecvt_converter.hpp
==============================================================================
--- sandbox/SOC/2009/unicode/boost/iterator/codecvt_converter.hpp	(original)
+++ sandbox/SOC/2009/unicode/boost/iterator/codecvt_converter.hpp	2010-08-08 20:58:42 EDT (Sun, 08 Aug 2010)
@@ -37,6 +37,8 @@
     template<typename In, typename Out>
     Out ltr(In& begin, In end, Out out)
     {
+        In old_begin = begin;
+        
         std::mbstate_t state;
         memset(&state, 0, sizeof state);
         
@@ -67,11 +69,9 @@
                 const_cast<Input*>(from_next)[i] = *begin++;
             }
             
-            if(to_next - buffer_out)
+            if(to_next - buffer_out || begin == end)
                 break;
-
-            if(begin == end)
-                throw std::out_of_range("unexpected end");            
+                
             *const_cast<Input*>(from_next) = *begin++;
         }
         
@@ -94,6 +94,9 @@
         }
         while(to_next != old_to_next);
     
+        // restore begin to the position given by the final 'from_next'
+        std::advance(old_begin, from_next - buffer_in);
+        begin = old_begin;
         return std::copy(buffer_out, to_next, out);
     }
     
Modified: sandbox/SOC/2009/unicode/boost/iterator/converter_codecvt.hpp
==============================================================================
--- sandbox/SOC/2009/unicode/boost/iterator/converter_codecvt.hpp	(original)
+++ sandbox/SOC/2009/unicode/boost/iterator/converter_codecvt.hpp	2010-08-08 20:58:42 EDT (Sun, 08 Aug 2010)
@@ -11,12 +11,34 @@
 #include <algorithm>
 
 #include <map>
-#include <boost/range/algorithm.hpp>
 #include <boost/range/join.hpp>
+#include <boost/circular_buffer.hpp>
 
 namespace boost
 {
 
+namespace detail
+{
+    template<typename A, typename B, typename T>
+    struct other; // primary template is only declared: T has to be A or B
+    // other<A, B, T>::type is whichever of A and B is not T (A itself when all three are the same).
+    template<typename A>
+    struct other<A, A, A> // both candidates are the same type
+    {
+        typedef A type;
+    };
+    template<typename A, typename B>
+    struct other<A, B, A> // T is the first type, so the result is the second
+    {
+        typedef B type;
+    };
+    template<typename A, typename B>
+    struct other<A, B, B> // T is the second type, so the result is the first
+    {
+        typedef A type;
+    };
+}
+
 /** Builds a codecvt facet from two \c \xmlonly<conceptname>Converter</conceptname>s\endxmlonly
  * and two \c \xmlonly<conceptname>BoundaryChecker</conceptname>s\endxmlonly.
  * When writing to a file, \c P1 is applied for segments of data on which \c B1 is true at the beginning and at the end.
@@ -36,27 +58,132 @@
     BOOST_CONCEPT_ASSERT((Convertible<InternT, typename P1::input_type>));
     BOOST_CONCEPT_ASSERT((Convertible<typename P2::output_type, InternT>));
     
-    explicit converter_codecvt(const B1& b1_ = B1(), const P1& p1_ = P1(), const B2& b2_ = B2(), const P2& p2_ = P2(), std::size_t refs = 0)
-        : std::codecvt<intern_type, extern_type, state_type>(refs), b1(b1_), p1(p1_), b2(b2_), p2(p2_)
+    typedef typename std::basic_ios<InternT>::pos_type pos_type;
+    
+    explicit converter_codecvt(pos_type file_size_ = (pos_type)-1, const B1& b1_ = B1(), const P1& p1_ = P1(), const B2& b2_ = B2(), const P2& p2_ = P2(), std::size_t refs = 0)
+        : std::codecvt<intern_type, extern_type, state_type>(refs), file_size(file_size_), b1(b1_), p1(p1_), b2(b2_), p2(p2_)
     {
     }
     
 private:
+    pos_type file_size;
+
+    template<typename T>
     struct state_t
     {
-        intern_type pending_data[P2::max_output::value];
-        size_t pending_size;
+        boost::circular_buffer<T> pending_data;
+        pos_type read_size;
+        
+        // size of storage is maximum size of input, which is not exposed
+        // by Converters, so we just take an arbitrary max size
+        state_t() : pending_data(64), read_size(0)
+        {
+        }
+    };
+    
+    struct state_pair_t
+    {
+        state_t<extern_type> in; // state handed to do_ by do_in
+        state_t<intern_type> out; // state handed to do_ by do_out, flushed by do_unshift
+        
+#ifdef BOOST_MSVC
+        // MSVC only calls 'in' step-by-step, so that is enough storage
+        intern_type pending_write_data[P2::max_output::value];
+        size_t pending_write_index; // position of the next element of pending_write_data to hand out
+        size_t pending_write_size; // number of elements of pending_write_data still to hand out
+        
+        state_pair_t() : pending_write_index(0), pending_write_size(0)
+        {
+        }
+#endif
+
+    };
+    
+    template<typename T>
+    struct other : detail::other<intern_type, extern_type, T>
+    {
     };
-    mutable std::map<state_type*, state_t> states;
+    
+    mutable std::map<state_type*, state_pair_t> states;
     
     mutable B1 b1;
     mutable P1 p1;
     
     mutable B2 b2;
     mutable P2 p2;
-    
-protected:
 
+    template<typename B, typename P, typename T>
+    std::codecvt_base::result do_( // conversion loop shared by do_in and do_out
+        B& b, P& p, // boundary checker and converter for this direction
+        state_t<T>& st, // pending data and read counter for this direction
+        const T* from, const T* from_end, const T*& from_next, // input buffer; from_next reports how far it was consumed
+        typename other<T>::type* to, typename other<T>::type* to_end, typename other<T>::type*& to_next // output buffer; to_next reports how far it was written
+    ) const
+    {
+        typedef const boost::iterator_range<typename circular_buffer<T>::const_iterator> range_circular;
+        typedef const boost::iterator_range<const T*> range_base;
+        typedef boost::range_detail::join_iterator<typename circular_buffer<T>::const_iterator, const T*> iterator;
+        // NOTE(review): 'to_end' is never consulted below, so nothing in this function bounds what is written through 'to_next'.
+        from_next = from;
+        to_next = to;
+        
+        // our real input is the concatenated pending data and the given input
+        boost::joined_range<range_circular, range_base> input = boost::join(
+            range_circular(st.pending_data.begin(), st.pending_data.end()),
+            range_base(from, from_end)
+        );
+        
+        iterator from2 = input.begin();
+        iterator from_next2 = from2;
+        iterator from_end2 = input.end();
+        
+        // while we have some input
+        while(from_next2 != from_end2)
+        {
+            iterator from_boundary = from_next2; // will mark the end of the next segment to convert
+            do
+            {
+                ++from_boundary;
+            }
+            while(from_boundary != from_end2 && !b(from_next2, from_end2, from_boundary));
+            // end of input: the caller-supplied file size has been reached, or no new input was passed in
+            bool eof = st.read_size + pos_type(from_end2 - from_next2) == file_size
+                    || from == from_end;
+            
+            // boundary not found and not end of file, we append the trailing data to 'pending'
+            if(from_boundary == from_end2 && !eof)
+            {
+                std::copy(from_next, from_end, std::back_inserter(st.pending_data));
+                from_next = from_end; // the whole input buffer is reported as consumed
+                return std::codecvt_base::ok;
+            }
+            // segment length is taken before the call; ltr presumably advances 'from_next2' by reference -- confirm against the Converter concept
+            size_t written = from_boundary - from_next2;
+            try
+            {
+                to_next = p.ltr(from_next2, from_boundary, to_next);
+            }
+            catch(...) // anything thrown by the converter is reported as a codecvt error
+            {
+                return std::codecvt_base::error;
+            }
+            
+            // erase the consumed pending data and update 'from_next'
+            st.read_size += written;
+            if(written >= st.pending_data.size())
+            {
+                from_next += written - st.pending_data.size(); // only the part beyond the pending data came from the input buffer
+                st.pending_data.clear();
+            }
+            else
+            {
+                st.pending_data.erase_begin(written);
+            }
+        }
+        return std::codecvt_base::ok;
+    }
+
+protected:
     virtual std::codecvt_base::result do_in(
         state_type& state, 
         const extern_type* from,
@@ -67,34 +194,43 @@
         intern_type*& to_next
     ) const
     {
-        state_t& st = states[&state];
+#if BOOST_MSVC
+        state_pair_t& st = states[&state];
         
-        from_next = from;
         to_next = to;
+        from_next = from;
         
-        if(st.pending_size)
+        // MSVC only supports getting one result per call, so we use a hack
+        if(st.pending_write_size)
         {
-            *to_next++ = st.pending_data[0];
-            std::copy(st.pending_data + 1, st.pending_data + st.pending_size, st.pending_data);
-            st.pending_size--;
+            *to_next++ = st.pending_write_data[st.pending_write_index];
+            st.pending_write_index++;
+            st.pending_write_size--;
+            
+            if(!st.pending_write_size)
+                from_next++;
+            
             return std::codecvt_base::ok;
         }
         
-        if(from_next == from_end)
-            return std::codecvt_base::ok;
-            
-        try
-        {
-            st.pending_size = p2.ltr(from_next, from_end, st.pending_data) - st.pending_data;
-            *to_next++ = st.pending_data[0];
-            std::copy(st.pending_data + 1, st.pending_data + st.pending_size, st.pending_data);
-            st.pending_size--;
-        }
-        catch(...)
+        intern_type* to_next2;
+        std::codecvt_base::result result = do_(b2, p2, states[&state].in, from, from_end, from_next, st.pending_write_data, st.pending_write_data + st.pending_write_size, to_next2);
+        st.pending_write_size = to_next2 - st.pending_write_data;
+        
+        if(st.pending_write_size)
         {
-            return std::codecvt_base::partial;
+            *to_next++ = st.pending_write_data[0];
+            st.pending_write_index = 1;
+            st.pending_write_size--;
         }
-        return std::codecvt_base::ok;
+        
+        if(st.pending_write_size)
+            from_next--;
+        
+        return result;
+#else
+        return do_(b2, p2, states[&state].in, from, from_end, from_next, to, to_end, to_next);
+#endif
     }
 
     virtual std::codecvt_base::result do_out(
@@ -107,52 +243,7 @@
         extern_type*& to_next
     ) const
     {
-        typedef const boost::iterator_range<const intern_type*> range_base;
-        typedef boost::range_detail::join_iterator<const intern_type*, const intern_type*> iterator;
-        
-        state_t& st = states[&state];
-        
-        from_next = from;
-        to_next = to;
-        
-        boost::joined_range<range_base, range_base> input = boost::join(
-            range_base(st.pending_data,  st.pending_data + st.pending_size),
-            range_base(from, from_end)
-        );
-        
-        iterator from2 = input.begin();
-        iterator from_next2 = from2;
-        iterator from_end2 = input.end();
-        
-        while(from_next2 != from_end2)
-        {
-            try
-            {
-                to_next = p1.ltr(from_next2, from_end2, to_next);
-            }
-            catch(...)
-            {
-                size_t written = from_next2 - from2;
-                if(written >= st.pending_size)
-                {
-                    from_next += (from_next2 - from2) - st.pending_size;
-                    st.pending_size = 0;
-                }
-                
-                boost::copy(range_base(from_next, from_end), st.pending_data + st.pending_size);
-                st.pending_size += (from_end - from_next);
-                from_next = from_end;
-                return std::codecvt_base::ok;
-            }
-        }
-        
-        size_t written = from_next2 - from2;
-        if(written >= st.pending_size)
-        {
-            from_next += (from_next2 - from2) - st.pending_size;
-            st.pending_size = 0;
-        }
-        return std::codecvt_base::ok;
+        return do_(b1, p1, states[&state].out, from, from_end, from_next, to, to_end, to_next);
     }
 
     virtual bool do_always_noconv() const throw()
@@ -167,12 +258,15 @@
         extern_type*& to_next
     ) const 
     {
-        state_t& st = states[&state];
+        typedef typename circular_buffer<intern_type>::const_iterator iterator;
+        state_t<intern_type>& st = states[&state].out;
         
         to_next = to;
-        const intern_type* from = st.pending_data;
-        const intern_type* from_next = from;
-        const intern_type* from_end = st.pending_data + st.pending_size;
+        
+        // we output the pending 'out' data
+        iterator from = st.pending_data.begin();
+        iterator from_next = from;
+        iterator from_end = st.pending_data.end();
         
         while(from_next != from_end)
         {
@@ -185,8 +279,7 @@
                 return std::codecvt_base::error;
             }
         }
-        
-        st.pending_size = 0;
+        st.pending_data.clear();
         return std::codecvt_base::ok;
     }
 
@@ -195,6 +288,7 @@
         return 0;
     }
 
+    // probably needs fixing, but no implementation uses this
     virtual int do_length(
         state_type&,
         const extern_type* from,
@@ -219,7 +313,9 @@
 
     virtual int do_max_length() const throw ()
     {
-        return P1::max_output::value;
+        // maximum size of P2's input, which is not exposed
+        // by Converters, so we just take an arbitrary max size
+        return 64;
     }
 };
     
Modified: sandbox/SOC/2009/unicode/boost/unicode/utf_codecs.hpp
==============================================================================
--- sandbox/SOC/2009/unicode/boost/unicode/utf_codecs.hpp	(original)
+++ sandbox/SOC/2009/unicode/boost/unicode/utf_codecs.hpp	2010-08-08 20:58:42 EDT (Sun, 08 Aug 2010)
@@ -81,6 +81,29 @@
         boost::throw_exception(e);
 }
 
+    template<typename Iterator> // random-access overload: answers by subtraction
+    bool distance_greater_impl(Iterator begin, Iterator end, typename std::iterator_traits<Iterator>::difference_type min, std::random_access_iterator_tag*)
+    {
+        return (end - begin) >= min;
+    }
+    // Overload for weaker iterators: walks at most 'min' steps rather than measuring the whole range.
+    template<typename Iterator>
+    bool distance_greater_impl(Iterator begin, Iterator end, typename std::iterator_traits<Iterator>::difference_type min, std::input_iterator_tag*)
+    {
+        // Succeed as soon as 'min' elements have been seen, so that a range holding
+        // exactly 'min' elements is accepted, as in the random-access overload.
+        for(typename std::iterator_traits<Iterator>::difference_type i=0; i < min; ++begin, ++i)
+            if(begin == end)
+                return false;
+        return true;
+    }
+    // Returns whether [begin, end) holds at least 'min' elements, dispatching on the iterator category.
+    template<typename Iterator>
+    bool distance_greater(Iterator begin, Iterator end, typename std::iterator_traits<Iterator>::difference_type min)
+    {
+        return distance_greater_impl(begin, end, min, (typename std::iterator_traits<Iterator>::iterator_category*)0);
+    }
+
 } // namespace detail
 
 /** Model of \c \xmlonly<conceptname>OneManyConverter</conceptname>\endxmlonly
@@ -216,7 +239,8 @@
         BOOST_ASSERT(pos != begin);
         BOOST_ASSERT(pos != end);
         
-        return !is_surrogate(*pos) || is_high_surrogate(*pos);
+        return !is_surrogate(*pos)
+            || (is_high_surrogate(*pos) && detail::distance_greater(pos, end, 2));
     }
 };
 
@@ -399,7 +423,8 @@
         BOOST_ASSERT(pos != end);
         
         unsigned char c = *pos;
-        return (c & 0x80) == 0 || (c & 0xc0) == 0xc0;
+        return (c & 0x80) == 0
+            || ((c & 0xc0) == 0xc0 && detail::distance_greater(pos, end, detail::utf8_byte_count(c)));
     }
 };
 
Modified: sandbox/SOC/2009/unicode/libs/unicode/doc/autodoc1c.xml
==============================================================================
--- sandbox/SOC/2009/unicode/libs/unicode/doc/autodoc1c.xml	(original)
+++ sandbox/SOC/2009/unicode/libs/unicode/doc/autodoc1c.xml	2010-08-08 20:58:42 EDT (Sun, 08 Aug 2010)
@@ -1,5 +1,5 @@
 <?xml version="1.0" standalone="yes"?>
-<library-reference xmlns:xi="http://www.w3.org/2001/XInclude" id="iterator_range_reference">
+<library-reference xmlns:xi="http://www.w3.org/2001/XInclude" id="converters_and_segmenters_reference">
 <title>Iterator/Range reference</title>
 
 <xi:include href="concepts/Converter.xml"/>
Modified: sandbox/SOC/2009/unicode/libs/unicode/doc/users_manual.qbk
==============================================================================
--- sandbox/SOC/2009/unicode/libs/unicode/doc/users_manual.qbk	(original)
+++ sandbox/SOC/2009/unicode/libs/unicode/doc/users_manual.qbk	2010-08-08 20:58:42 EDT (Sun, 08 Aug 2010)
@@ -505,11 +505,16 @@
 Unfortunately, it appears it is only possible to use this mechanism with codecvt facets that have =char= as external and either
 =char= or =wchar_t= as internal, but C++0x may improve the situation.
 
-To use [classref boost::converter_codecvt], which allows to build a codecvt facet from converters, you will need two [conceptref Converter]s, one for each direction, as well as two [conceptref BoundaryChecker]s.
+To use [classref boost::converter_codecvt], which allows building a codecvt facet from converters, you will need two [conceptref Converter]s, one
+for each direction, as well as two [conceptref BoundaryChecker]s.
 Indeed, as codecvt facets are passed arbitrary input buffers, there needs to be a way to tell what is the right boundaries to apply the steps on.
 An alternative would be to try to apply a step and try again if there was an error due to incomplete data. This is however not sufficient for
 converters that are not stable by concatenation.
 
+Unfortunately, codecvt facets do not provide a way to identify the end of the input in the file-to-memory case -- even though they do in the
+other direction -- and the generic mechanism used to build codecvt facets needs this.
+As a workaround, you can provide the size of the file to the codecvt facet's constructor.
+
 You may also build converters out of codecvt facets with [classref boost::codecvt_in_converter] or [classref boost::codecvt_out_converter], or
 directly convert locales to UTF-32 with [classref boost::unicode::locale_decoder] or [classref boost::unicode::locale_encoder].
 
Modified: sandbox/SOC/2009/unicode/libs/unicode/example/Jamfile.v2
==============================================================================
--- sandbox/SOC/2009/unicode/libs/unicode/example/Jamfile.v2	(original)
+++ sandbox/SOC/2009/unicode/libs/unicode/example/Jamfile.v2	2010-08-08 20:58:42 EDT (Sun, 08 Aug 2010)
@@ -25,4 +25,5 @@
     [ run compose.cpp ]
     [ run search.cpp ]
     [ run source_input.cpp ]
+    [ run base64.cpp ]
 ;
Added: sandbox/SOC/2009/unicode/libs/unicode/example/base64.cpp
==============================================================================
--- (empty file)
+++ sandbox/SOC/2009/unicode/libs/unicode/example/base64.cpp	2010-08-08 20:58:42 EDT (Sun, 08 Aug 2010)
@@ -0,0 +1,115 @@
+#include <boost/iterator/convert_iterator.hpp>
+#include <boost/iterator/converter_codecvt.hpp>
+
+#include <boost/range/as_literal.hpp>
+
+#include <fstream>
+
+struct base64_encoder // Converter whose steps turn up to three input bytes into four base64 characters
+{
+    typedef char input_type;
+    typedef char output_type;
+    // Every step below writes exactly four output characters.
+    typedef boost::mpl::int_<4> max_output;
+    // Encodes the next one to three bytes of [begin, end), '='-padding a short group; requires begin != end and advances 'begin'.
+    template<typename In, typename Out>
+    Out ltr(In& begin, In end, Out out)
+    {
+        const char * lookup_table = 
+            "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+            "abcdefghijklmnopqrstuvwxyz"
+            "0123456789"
+            "+/";
+        // Bytes are held unsigned: where 'char' is signed, values >= 0x80 would otherwise shift to negative table indices.
+        unsigned char in[3];
+        
+        in[0] = *begin++;
+        *out++ = lookup_table[in[0] >> 2];
+        if(begin == end)
+        {
+            *out++ = lookup_table[(in[0] & 0x03) << 4];
+            *out++ = '=';
+            *out++ = '=';
+            return out;
+        }
+        
+        in[1] = *begin++;
+        *out++ = lookup_table[((in[0] & 0x03) << 4) | (in[1] >> 4)];
+        if(begin == end)
+        {
+            *out++ = lookup_table[(in[1] & 0x0f) << 2];
+            *out++ = '=';
+            return out;
+        }
+        
+        in[2] = *begin++;
+        *out++ = lookup_table[((in[1] & 0x0f) << 2) | (in[2] >> 6)];
+        *out++ = lookup_table[in[2] & 0x3f];
+        return out;
+    }
+    // Encodes the last group of [begin, end): steps 'end' back over the trailing one to three bytes and feeds them to ltr.
+    template<typename In, typename Out>
+    Out rtl(In begin, In& end, Out out)
+    {
+        size_t to_read = std::distance(begin, end) % 3;
+        if(!to_read)
+            to_read = 3;
+        
+        char in[3];
+        for(size_t i=0; i<to_read; i++)
+            in[to_read-i-1] = *--end;
+            
+        char* b = in;
+        return ltr(b, in+to_read, out);
+    }
+};
+
+template<std::size_t N> // boundary checker placing a boundary every N elements, counted from 'begin'
+struct fixed_boundary
+{
+    typedef char input_type;
+    // 'end' is part of the call shape used by converter_codecvt but is not consulted here.
+    template<typename In>
+    bool operator()(In begin, In end, In pos)
+    {
+        return std::distance(begin, pos) % N == 0;
+    }
+};
+
+typedef boost::converter_codecvt<
+    char,
+    fixed_boundary<3>,
+    base64_encoder,
+    fixed_boundary<3>,
+    base64_encoder
+> base64_codecvt;
+
+#define CHECK_EQUAL(a, b) if(a != b) std::unexpected();
+
+int main()
+{
+    char data_in_[] = "fooba"; // raw bytes written to the file
+    char data_out_[] = "Zm9vYmE="; // characters expected when the file is read back through the facet
+    
+    boost::iterator_range<const char*> data_in = boost::as_literal(data_in_);
+    boost::iterator_range<const char*> data_out = boost::as_literal(data_out_);
+    // The facet receives the input size as its file size argument.
+    std::locale old_loc;
+    std::locale loc(old_loc, new base64_codecvt(boost::size(data_in)));
+    {
+        std::ofstream ofs("test.base64"); // 'loc' is not imbued on this stream, only on the input stream below
+        ofs << data_in;
+    }
+    // Read the file back with 'loc' imbued and compare each character against data_out.
+    std::ifstream ifs("test.base64");
+    ifs.imbue(loc);
+    
+    char c;
+    size_t i = 0;
+    while(ifs.get(c))
+    {
+        CHECK_EQUAL(c, data_out[i]);
+        ++i;
+    }
+    CHECK_EQUAL(i, (size_t)boost::size(data_out)); // the number of characters read must match the expected text
+}
Modified: sandbox/SOC/2009/unicode/libs/unicode/test/iterator/test_codecvt.cpp
==============================================================================
--- sandbox/SOC/2009/unicode/libs/unicode/test/iterator/test_codecvt.cpp	(original)
+++ sandbox/SOC/2009/unicode/libs/unicode/test/iterator/test_codecvt.cpp	2010-08-08 20:58:42 EDT (Sun, 08 Aug 2010)
@@ -2,7 +2,8 @@
 /*`
 This test/example shows how to use a codecvt facet that transcodes from
 wide chars (UTF-16 or UTF-32) to UTF-8 on the way out, and that
-does the opposite on the way in, but normalizes the string as well.
+does the opposite on the way in.
+It also demonstrates a variant that normalizes data read from the file.
 */
 #define BOOST_TEST_MODULE Codecvt
 #include <boost/test/included/unit_test.hpp>
@@ -13,19 +14,48 @@
 #include <boost/range/algorithm.hpp>
 #include <boost/range/as_literal.hpp>
 
+// e\u0301 is \u00E9
+// \U0002FA1D is \U0002A600
+const wchar_t data_[] = L"hello e\u0301 \U0002FA1D world";
+boost::iterator_range<const wchar_t*> data = boost::as_literal(data_);
+    
+const wchar_t data_normalized_[] = L"hello \u00E9 \U0002A600 world";
+boost::iterator_range<const wchar_t*> data_normalized = boost::as_literal(data_normalized_);
 
 BOOST_AUTO_TEST_CASE( codecvt )
 {
-    // e\u0301 is \u00E9
-    // \U0002FA1D is \U0002A600
-    const wchar_t data_[] = L"hello e\u0301 \U0002FA1D world";
-    boost::iterator_range<const wchar_t*> data = boost::as_literal(data_);
-    
-    const wchar_t data_normalized_[] = L"hello \u00E9 \U0002A600 world";
-    boost::iterator_range<const wchar_t*> data_normalized = data;//boost::as_literal(data_normalized_);
+    std::locale old_locale;
+    std::locale utf8_locale(old_locale, new boost::unicode::utf_u8_codecvt(20));
+
+    // Set a new global locale
+    //std::locale::global(utf8_locale);
+
+    // Send the UTF-X data out, converting to UTF-8
+    {
+        std::wofstream ofs("data.ucd");
+        ofs.imbue(utf8_locale);
+        boost::copy(data, std::ostream_iterator<wchar_t, wchar_t>(ofs));
+    }
+
+    // Read the UTF-8 data back in, converting to UTF-X on the way in
+    {
+        std::wifstream ifs("data.ucd");
+        ifs.imbue(utf8_locale);
+        wchar_t item = 0;
+        size_t i = 0;
+        while (ifs >> std::noskipws >> item)
+        {
+            BOOST_CHECK_EQUAL(data[i], item);
+            i++;
+        }
+        BOOST_CHECK_EQUAL(i, (size_t)boost::size(data));
+    }
+}
 
+BOOST_AUTO_TEST_CASE( codecvt_normalized )
+{
     std::locale old_locale;
-    std::locale utf8_locale(old_locale, new boost::unicode::utf_u8_codecvt());
+    std::locale utf8_locale(old_locale, new boost::unicode::utf_u8_normalize_codecvt(20));
 
     // Set a new global locale
     //std::locale::global(utf8_locale);
@@ -51,4 +81,5 @@
         BOOST_CHECK_EQUAL(i, (size_t)boost::size(data_normalized));
     }
 }
+
 //]
Modified: sandbox/SOC/2009/unicode/libs/unicode/test/unicode/test_locale.cpp
==============================================================================
--- sandbox/SOC/2009/unicode/libs/unicode/test/unicode/test_locale.cpp	(original)
+++ sandbox/SOC/2009/unicode/libs/unicode/test/unicode/test_locale.cpp	2010-08-08 20:58:42 EDT (Sun, 08 Aug 2010)
@@ -12,18 +12,10 @@
 namespace unicode = boost::unicode;
 using boost::char32;
 
-typedef boost::converter_codecvt<
-    wchar_t,
-    boost::unicode::utf_boundary,
-    boost::unicode::utf_transcoder<char>,
-    boost::unicode::utf_boundary,
-    boost::unicode::utf_transcoder<wchar_t>
-> utf_u8_codecvt;
-
 BOOST_AUTO_TEST_CASE( locale_custom )
 {
     std::locale old_locale;
-    std::locale loc(old_locale, new utf_u8_codecvt);
+    std::locale loc(old_locale, new unicode::utf_u8_codecvt());
     
     char input_utf8_[] = "hello \xc3\xa9 \xf0\xaa\x98\x80 world";
     boost::iterator_range<const char*> input_utf8 = boost::as_literal(input_utf8_);
@@ -34,7 +26,20 @@
     CHECK_EQUALS(unicode::adaptors::locale_decode(input_utf8, loc), output);
     std::cout << "------------ locale_encode custom ----------------\n" << std::endl;
     CHECK_EQUALS(unicode::adaptors::locale_encode(output, unicode::locale_encoder(unicode::utf_encoder<wchar_t>(), unicode::utf_locale_transcoder(loc))), input_utf8);
+}
+
+BOOST_AUTO_TEST_CASE( locale_custom_normalize )
+{
+    std::locale old_locale;
+    std::locale loc(old_locale, new unicode::utf_u8_normalize_codecvt());
     
+    char input_utf8_[] = "hello e\xcc\x81 \xf0\xaf\xa8\x9d world";
+    boost::iterator_range<const char*> input_utf8 = boost::as_literal(input_utf8_);
+    
+    char32 output[] = {'h', 'e', 'l', 'l', 'o', ' ', 0xE9, ' ', 0x2A600, ' ', 'w', 'o', 'r', 'l', 'd'};
+    
+    std::cout << "------------ locale_decode custom normalize ----------------\n" << std::endl;
+    CHECK_EQUALS(unicode::adaptors::locale_decode(input_utf8, loc), output);
 }
 
 BOOST_AUTO_TEST_CASE( locale_native )