$include_dir="/home/hyper-archives/boost-commit/include"; include("$include_dir/msg-header.inc") ?>
Subject: [Boost-commit] svn:boost r55578 - in sandbox/SOC/2009/unicode: boost/iterator boost/unicode libs/unicode/example
From: loufoque_at_[hidden]
Date: 2009-08-13 22:25:09
Author: mgaunard
Date: 2009-08-13 22:25:08 EDT (Thu, 13 Aug 2009)
New Revision: 55578
URL: http://svn.boost.org/trac/boost/changeset/55578
Log:
Normalization support
Added:
   sandbox/SOC/2009/unicode/boost/unicode/cat.hpp   (contents, props changed)
Text files modified: 
   sandbox/SOC/2009/unicode/boost/iterator/pipe_iterator.hpp |    77 ++++++++++++++++++++++++                
   sandbox/SOC/2009/unicode/boost/unicode/compose.hpp        |     1                                         
   sandbox/SOC/2009/unicode/boost/unicode/compose_fwd.hpp    |   127 ++++++++++++++++++++++++++++++++++----- 
   sandbox/SOC/2009/unicode/boost/unicode/hangul.hpp         |    17 +++++                                   
   sandbox/SOC/2009/unicode/libs/unicode/example/compose.cpp |     8 ++                                      
   5 files changed, 210 insertions(+), 20 deletions(-)
Modified: sandbox/SOC/2009/unicode/boost/iterator/pipe_iterator.hpp
==============================================================================
--- sandbox/SOC/2009/unicode/boost/iterator/pipe_iterator.hpp	(original)
+++ sandbox/SOC/2009/unicode/boost/iterator/pipe_iterator.hpp	2009-08-13 22:25:08 EDT (Thu, 13 Aug 2009)
@@ -6,6 +6,8 @@
 
 #include <boost/range.hpp>
 #include <boost/mpl/int.hpp>
+#include <boost/mpl/times.hpp>
+#include <boost/tuple/tuple.hpp>
 
 #include <boost/concept/requires.hpp>
 #include <boost/range/concepts.hpp>
@@ -58,6 +60,81 @@
         return one_many_pipe<OneManyPipe>(p);
 }
 
+/* TODO: make it work for pipes that don't expose max_output */
+/** Model of \c \xmlonly<conceptname>Pipe</conceptname>\endxmlonly
+ * constructed from two models of \c \xmlonly<conceptname>Pipe</conceptname>\endxmlonly
+ * and that applies one after the other. */
+template<typename P1, typename P2>
+struct multi_pipe
+{
+    BOOST_CONCEPT_ASSERT((PipeConcept<P1>));
+    BOOST_CONCEPT_ASSERT((PipeConcept<P2>));
+    // The output of the first stage must be convertible to the input of the second.
+    BOOST_CONCEPT_ASSERT((Convertible<typename P1::output_type, typename P2::input_type>));
+    
+    typedef typename P1::input_type input_type;
+    typedef typename P2::output_type output_type;
+    // max_output of the combination is the product of the two stages' max_output.
+    typedef typename mpl::times<
+        typename P1::max_output,
+        typename P2::max_output
+    >::type max_output;
+    
+    multi_pipe() {}
+    multi_pipe(P1 p1_, P2 p2_ = P2()) : p1(p1_), p2(p2_) {}
+    /** One step: \c p1.ltr fills a scratch buffer, which \c p2 then writes to \c out. \return the position reached by \c p1 and the advanced \c out. */
+    template<typename In, typename Out>
+    std::pair<In, Out> ltr(In begin, In end, Out out)
+    {
+        typename P1::output_type buf[P1::max_output::value]; // holds p1's output only
+        typename P1::output_type* b = buf;
+        
+        std::pair<In, typename P1::output_type*> pair = p1.ltr(begin, end, buf);
+        typename P1::output_type* e = pair.second;
+        // Each call returns how far p2 has read; repeat until the buffer is used up.
+        do
+        {
+            tie(b, out) = p2.ltr(b, e, out);
+        }
+        while(b != e);
+        
+        return std::make_pair(pair.first, out);
+    }
+    /** Same as \c ltr, except that \c p1 is stepped with \c p1.rtl. \return the position reached by \c p1 and the advanced \c out. */
+    template<typename In, typename Out>
+    std::pair<In, Out> rtl(In begin, In end, Out out)
+    {
+        typename P1::output_type buf[P1::max_output::value]; // holds p1's output only
+        typename P1::output_type* b = buf;
+        
+        std::pair<In, typename P1::output_type*> pair = p1.rtl(begin, end, buf);
+        typename P1::output_type* e = pair.second;
+        // NOTE(review): the buffer is still passed through p2.ltr; presumably p1.rtl fills it front to back -- confirm.
+        do
+        {
+            tie(b, out) = p2.ltr(b, e, out);
+        }
+        while(b != e);
+        
+        return std::make_pair(pair.first, out);
+    }
+    
+private:
+    P1 p1;
+    P2 p2;
+};
+/** Builds a \c multi_pipe from \c p1 and \c p2, deducing both pipe types from the arguments. */
+template<typename P1, typename P2>
+BOOST_CONCEPT_REQUIRES(
+    ((PipeConcept<P1>))((PipeConcept<P2>))
+    ((Convertible<typename P1::output_type, typename P2::input_type>)),
+    (multi_pipe<P1, P2>)
+) make_multi_pipe(P1 p1, P2 p2)
+{
+    typedef multi_pipe<P1, P2> pipe_type;
+    return pipe_type(p1, p2);
+}
+
 /** Model of \c \xmlonly<conceptname>OneManyPipe</conceptname>\endxmlonly
  * that casts its input to its template parameter and writes it to its output. */
 template<typename T>
Added: sandbox/SOC/2009/unicode/boost/unicode/cat.hpp
==============================================================================
--- (empty file)
+++ sandbox/SOC/2009/unicode/boost/unicode/cat.hpp	2009-08-13 22:25:08 EDT (Thu, 13 Aug 2009)
@@ -0,0 +1,6 @@
+#ifndef BOOST_UNICODE_CAT_HPP
+#define BOOST_UNICODE_CAT_HPP
+
+
+
+#endif
Modified: sandbox/SOC/2009/unicode/boost/unicode/compose.hpp
==============================================================================
--- sandbox/SOC/2009/unicode/boost/unicode/compose.hpp	(original)
+++ sandbox/SOC/2009/unicode/boost/unicode/compose.hpp	2009-08-13 22:25:08 EDT (Thu, 13 Aug 2009)
@@ -11,6 +11,7 @@
 
 BOOST_UNICODE_PIPE_DEF(compose, 0)
 BOOST_UNICODE_PIPE_DEF(decompose, 1)
+BOOST_UNICODE_PIPE_DEF(normalize, 1)
 
 } // namespace unicode
 } // namespace boost
Modified: sandbox/SOC/2009/unicode/boost/unicode/compose_fwd.hpp
==============================================================================
--- sandbox/SOC/2009/unicode/boost/unicode/compose_fwd.hpp	(original)
+++ sandbox/SOC/2009/unicode/boost/unicode/compose_fwd.hpp	2009-08-13 22:25:08 EDT (Thu, 13 Aug 2009)
@@ -7,9 +7,18 @@
 #include <boost/integer/static_pow.hpp>
 #include <climits>
 
-#include <boost/iterator/pipe_iterator.hpp>
 #include <vector>
 
+#include <boost/throw_exception.hpp>
+#include <stdexcept>
+#ifndef BOOST_NO_STD_LOCALE
+#include <sstream>
+#include <ios>
+#endif
+
+#include <boost/detail/unspecified.hpp>
+#include <boost/iterator/pipe_iterator.hpp>
+
 namespace boost
 {
 namespace unicode
@@ -21,6 +30,30 @@
 #undef BOOST_UNICODE_OPTION
 #endif
 
+namespace detail
+{
+    struct combining_pred // orders code points by ucd::get_combining_class, ascending
+    {
+        bool operator()(char32 lft, char32 rgt) const
+        {
+            return ucd::get_combining_class(lft) < ucd::get_combining_class(rgt);
+        }
+    };
+    
+    /** Stable in-place sort of [begin, end[ built only on public algorithms (binary insertion sort), so it
+     * needs no temporary buffer and no library-internal helper such as \c std::__stable_sort_adaptive.
+     * Quadratic, meant for short ranges; requires random access. \c Size (caller's length bound) is unused here. */
+    template<typename Size, typename Iterator, typename Comp>
+    void stable_sort_bounded(Iterator begin, Iterator end, Comp comp = std::less<typename std::iterator_traits<Iterator>::value_type>())
+    {
+        // upper_bound places *it after the elements it compares equal to, which keeps the sort stable.
+        for(Iterator it = begin; it != end; ++it)
+            std::rotate(std::upper_bound(begin, it, *it, comp), it, it + 1);
+    }
+
+}
+
+/* TODO: special case the case when Out is a RandomAccessIterator */
 /** Model of \c \xmlonly<conceptname>Pipe</conceptname>\endxmlonly
  * that decomposes a combining character sequence, i.e. it transforms a combining
  * character sequence into its canonically ordered decomposed equivalent.
@@ -31,53 +64,105 @@
     typedef char32 input_type;
     typedef char32 output_type;
     
+    typedef mpl::int_<31> max_output;
+    
     decomposer(unsigned mask_ = BOOST_UNICODE_OPTION(ucd::decomposition_type::canonical)) : mask(mask_)
     {
     }
     
-    /** \post \c out is in Normalization Form D. */
+    /** Reads one sequence starting at \c begin into a bounded buffer, decomposing each code point, orders it with \c detail::combining_pred if needed and copies it to \c out; \c inverse (set by \c rtl) flips the combining-class test that ends the sequence.
+     * Throws \c std::out_of_range if [<tt>begin</tt>, <tt>end</tt>[ is not stream-safe. \return the first unread position and the advanced \c out. \post \c out is in Normalization Form D. */
     template<typename In, typename Out>
-    std::pair<In, Out> ltr(In begin, In end, Out out)
+    std::pair<In, Out> ltr(In begin, In end, Out out, bool inverse = false)
     {
+        In pos = begin;
+        // Scratch buffer: the sequence is collected here so that it can be sorted before reaching out.
+        char32 buf[max_output::value];
+        char32* out_pos = buf;
+        // NOTE(review): with inverse == true the loop below stops at the first following code point whose combining class is non-zero, so a run of several marks read backwards looks like it is split across calls -- confirm.
+        bool to_sort = false; // becomes true once a code point with non-zero combining class is read
         do
         {
-            char32 ch = *begin;
+            char32 ch = *pos;
             if(ucd::get_combining_class(ch) != 0)
-            {
-                // canonical reorder, not handled yet
-            }
+                to_sort = true;
         
             iterator_range<const char32*> dec = ucd::get_decomposition(ch);
             if(!empty(dec) && ((1 << ucd::get_decomposition_type(ch)) & mask))
             {
-                out = pipe(dec, *this, out); // we decompose recursively
+                for(const char32* p = boost::begin(dec); p != boost::end(dec); ++p)
+                    out_pos = decompose_rec(*p, out_pos); // NOTE(review): no capacity check on this path -- confirm buf cannot overflow
             }
             else if(BOOST_UNICODE_OPTION(ucd::decomposition_type::canonical) & mask)
             {
-                out = hangul_decomposer()(ch, out);
+                if(hangul_decomposer::len(ch) <= (buf + max_output::value) - out_pos) // the whole result must fit in what is left
+                    out_pos = hangul_decomposer()(ch, out_pos);
+                else
+                    not_stream_safe(begin, end);
+            }
+            else if(out_pos != (buf + max_output::value))
+            {
+                *out_pos++ = ch;
             }
             else
             {
-                *out++ = ch;
+                not_stream_safe(begin, end);
             }
             
-            ++begin;
+            ++pos;
         }
-        while(begin != end && ucd::get_combining_class(*begin) != 0);
+        while(pos != end && ((!inverse && ucd::get_combining_class(*pos) != 0) || (inverse && ucd::get_combining_class(*pos) == 0)));
         
-        return std::make_pair(begin, out);
+        if(to_sort)
+            detail::stable_sort_bounded<max_output>(buf, out_pos, detail::combining_pred());
+
+        out = std::copy(buf, out_pos, out);
+        return std::make_pair(pos, out);
     }
     
-    /** \post \c out is in Normalization Form D. */
+    /** Right-to-left counterpart of \c ltr: runs \c ltr over [<tt>begin</tt>, <tt>end</tt>[ reversed, with \c inverse set.
+     * Throws \c std::out_of_range if the range is not stream-safe. \post \c out is in Normalization Form D. */
     template<typename In, typename Out>
     std::pair<In, Out> rtl(In begin, In end, Out out)
     {
-        // NOT IMPLEMENTED
-        *out++ = *--end;
-        return std::make_pair(end, out);
+        typedef reverse_iterator<In> backwards;
+        backwards from = make_reverse_iterator(end);
+        backwards to = make_reverse_iterator(begin);
+        std::pair<backwards, Out> step = ltr(from, to, out, true);
+        return std::make_pair(step.first.base(), step.second); // base() converts the reached position back to an In
     }
     
 private:
+    template<typename Iterator>
+    static void not_stream_safe(Iterator begin, Iterator end) // builds a std::out_of_range and hands it to boost::throw_exception
+    {
+#ifdef BOOST_NO_STD_LOCALE
+        std::out_of_range e("Invalid Unicode stream-safe combining character sequence encountered while trying to decompose UTF-32 sequence");
+#else
+        std::stringstream message; // every element of [begin, end[ is streamed into the message with std::showbase and std::hex set
+        message << "Invalid Unicode stream-safe combining character sequence " << std::showbase << std::hex;
+        for(; begin != end; ++begin)
+            message << *begin << " ";
+        message << "encountered while trying to decompose UTF-32 sequence";
+        std::out_of_range e(message.str());
+#endif
+        boost::throw_exception(e);
+    }
+    
+    template<typename OutputIterator>
+    OutputIterator decompose_rec(char32 ch, OutputIterator out) // writes ch, or recursively its decomposition, to out and returns the advanced out
+    {
+        iterator_range<const char32*> parts = ucd::get_decomposition(ch);
+        if(empty(parts) || !((1 << ucd::get_decomposition_type(ch)) & mask))
+        {
+            *out++ = ch; // no decomposition, or its type is not selected by mask: emit ch itself
+            return out;
+        }
+        for(const char32* it = begin(parts); it != end(parts); ++it)
+            out = decompose_rec(*it, out); // each part may itself decompose further
+        return out;
+    }
+
     unsigned mask;
 };
 
@@ -128,7 +213,8 @@
     typedef char32 output_type;
     typedef mpl::int_<1> max_output;
     
-    /** \pre [<tt>begin</tt>, <tt>end</tt>[ is in Normalization Form D. */
+    /** \pre [<tt>begin</tt>, <tt>end</tt>[ is in Normalization Form D.
+     *  \post \c out is in Normalization Form C. */
     template<typename In, typename Out>
     std::pair<In, Out> ltr(In begin, In end, Out out)
     {
@@ -172,7 +258,8 @@
     }
     
     /* This could by made faster using a sorted table of reversed strings */
-    /** \pre [<tt>begin</tt>, <tt>end</tt>[ is in Normalization Form D. */
+    /** \pre [<tt>begin</tt>, <tt>end</tt>[ is in Normalization Form D.
+     *  \post \c out is in Normalization Form C. */
     template<typename In, typename Out>
     std::pair<In, Out> rtl(In begin, In end, Out out)
     {
@@ -224,6 +311,8 @@
     }
 };
 
+typedef boost::detail::unspecified< multi_pipe<decomposer, composer> >::type normalizer; // a decomposer stage followed by a composer stage
+
 } // namespace unicode
 } // namespace boost
 
Modified: sandbox/SOC/2009/unicode/boost/unicode/hangul.hpp
==============================================================================
--- sandbox/SOC/2009/unicode/boost/unicode/hangul.hpp	(original)
+++ sandbox/SOC/2009/unicode/boost/unicode/hangul.hpp	2009-08-13 22:25:08 EDT (Thu, 13 Aug 2009)
@@ -32,6 +32,7 @@
  * Other code points are left unchanged. */
 struct hangul_decomposer
 {
+    typedef char32 input_type;
     typedef char32 output_type;
     typedef mpl::int_<3> max_output;
     
@@ -59,8 +60,23 @@
         return out;
     }
     
+    static int len(char32 ch) // presumably the number of code points operator() writes for ch (1, 2 or 3) -- confirm
+    {
+        using namespace detail;
+        // Outside [SBase, SBase + SCount[ the answer is 1.
+        // Compared this way so the test holds whether or not char32 is a signed type.
+        if(ch < SBase || ch - SBase >= SCount)
+            return 1;
+        
+        // Inside the range: 3 when the index has a non-zero remainder modulo TCount, otherwise 2.
+        if((ch - SBase) % TCount)
+            return 3;
+        return 2;
+    }
+    
 };
 
+/* TODO: implement it */
 /** \c \xmlonly<conceptname>Pipe</conceptname>\endxmlonly that
  * transforms <L, V>, <L, V, T> and <LV, T> Hangul code points sequences into the
  * LV and LVT Hangul syllables, since those compositions are not part
@@ -68,6 +84,7 @@
  * Other code points are left unchanged. */
 struct hangul_composer
 {
+    typedef char32 input_type;
     typedef char32 output_type;
     typedef mpl::int_<1> max_output;
     
Modified: sandbox/SOC/2009/unicode/libs/unicode/example/compose.cpp
==============================================================================
--- sandbox/SOC/2009/unicode/libs/unicode/example/compose.cpp	(original)
+++ sandbox/SOC/2009/unicode/libs/unicode/example/compose.cpp	2009-08-13 22:25:08 EDT (Thu, 13 Aug 2009)
@@ -9,6 +9,8 @@
 #include <iostream>
 #include <iterator>
 
+#include <boost/range/adaptor/reversed.hpp>
+
 namespace unicode = boost::unicode;
 namespace ucd = unicode::ucd;
 
@@ -39,8 +41,12 @@
     std::cout << std::endl;
     std::cout << "Decomposition type: " << as_string(ucd::get_decomposition_type(cp)) << std::endl;
     
+    boost::char32 baz[] = { cp, 0x330 };
     std::cout << "Canonical decomposition: ";
-    unicode::decompose(boost::list_of(cp), std::ostream_iterator<boost::char32>(std::cout, " "));
+    std::cout << unicode::composed(unicode::decomposed(baz)) << std::endl;
+    std::cout << "reversed: " << boost::make_reversed_range(unicode::composed(unicode::decomposed(baz))) << std::endl;
+    std::cout << unicode::normalized(baz) << std::endl;
+    std::cout << "reversed: " << boost::make_reversed_range(unicode::normalized(baz));
     std::cout << std::endl << std::endl;
     
     std::cout << "Canonical decomposition of U+00A8: ";