<?php $include_dir="/home/hyper-archives/boost-commit/include"; include("$include_dir/msg-header.inc") ?>
Subject: [Boost-commit] svn:boost r76790 - branches/quickbook-dev/tools/quickbook/src
From: dnljms_at_[hidden]
Date: 2012-01-29 19:06:37
Author: danieljames
Date: 2012-01-29 19:06:36 EST (Sun, 29 Jan 2012)
New Revision: 76790
URL: http://svn.boost.org/trac/boost/changeset/76790
Log:
Quickbook: Fix handling of UTF-8 characters in the syntax highlighter.
So far, quickbook has just pretended that it's dealing with a
single-byte encoding.  For the most part this works quite well due to the
design of UTF-8, but the syntax highlighter needs to pick out individual
characters correctly, so this implements just enough to do that. It still
does odd things with invalid UTF-8.
Text files modified: 
   branches/quickbook-dev/tools/quickbook/src/parsers.hpp          |    42 ++++++++++++++++++++++++++++++++++++++++
   branches/quickbook-dev/tools/quickbook/src/syntax_highlight.cpp |    10 ++++----                                
   2 files changed, 47 insertions(+), 5 deletions(-)
Modified: branches/quickbook-dev/tools/quickbook/src/parsers.hpp
==============================================================================
--- branches/quickbook-dev/tools/quickbook/src/parsers.hpp	(original)
+++ branches/quickbook-dev/tools/quickbook/src/parsers.hpp	2012-01-29 19:06:36 EST (Sun, 29 Jan 2012)
@@ -253,6 +253,48 @@
     };
     
     lookback_gen const lookback = lookback_gen();
+ 
+    ///////////////////////////////////////////////////////////////////////////
+    //
+    // UTF-8 code point
+    //
+    // Very crude, it doesn't check that the code point is in any way valid.
+    // Just looks for the beginning of the next character. This is just for
+    // implementing some crude fixes, rather than full unicode support. I'm
+    // sure experts would be appalled.
+    //
+    ///////////////////////////////////////////////////////////////////////////
+
+    struct utf8_char_parser : public cl::parser<utf8_char_parser>
+    {
+        typedef utf8_char_parser self_t;
+
+        template <typename Scanner>
+        struct result
+        {
+            typedef cl::match<> type;
+        };
+
+        template <typename Scanner>
+        typename result<Scanner>::type parse(Scanner const& scan) const
+        {
+            typedef typename Scanner::iterator_t iterator_t;
+
+            if (scan.at_end()) return scan.no_match();
+
+            iterator_t save(scan.first);
+
+            do {
+                ++scan.first;
+            } while (!scan.at_end() &&
+                    ((unsigned char) *scan.first & 0xc0) == 0x80);
+
+            return scan.create_match(scan.first.base() - save.base(),
+                    cl::nil_t(), save, scan.first);
+        }
+    };
+  
+    utf8_char_parser const utf8_char_p = utf8_char_parser();
 }
 
 #endif // BOOST_QUICKBOOK_SCOPED_BLOCK_HPP
Modified: branches/quickbook-dev/tools/quickbook/src/syntax_highlight.cpp
==============================================================================
--- branches/quickbook-dev/tools/quickbook/src/syntax_highlight.cpp	(original)
+++ branches/quickbook-dev/tools/quickbook/src/syntax_highlight.cpp	2012-01-29 19:06:36 EST (Sun, 29 Jan 2012)
@@ -287,7 +287,7 @@
                     |   string_                         [span("string")]
                     |   char_                           [span("char")]
                     |   number                          [span("number")]
-                    |   cl::repeat_p(1)[cl::anychar_p]  [unexpected_char]
+                    |   utf8_char_p                     [unexpected_char]
                     )
                     ;
 
@@ -362,7 +362,7 @@
                     =   +cl::chset_p("~!%^&*()+={[}]:;,<.>?/|\\-")
                     ;
 
-                string_char = ('\\' >> cl::anychar_p) | (cl::anychar_p - '\\');
+                string_char = ('\\' >> utf8_char_p) | (cl::anychar_p - '\\');
 
                 string_
                     =   !cl::as_lower_d['l'] >> cl::confix_p('"', *string_char, '"')
@@ -442,7 +442,7 @@
                     |   special                         [span("special")]
                     |   string_                         [span("string")]
                     |   number                          [span("number")]
-                    |   cl::repeat_p(1)[cl::anychar_p]  [unexpected_char]
+                    |   utf8_char_p                     [unexpected_char]
                     )
                     ;
 
@@ -498,7 +498,7 @@
                     =   ! string_prefix >> (long_string | short_string)
                     ;
 
-                string_char = ('\\' >> cl::anychar_p) | (cl::anychar_p - '\\');
+                string_char = ('\\' >> utf8_char_p) | (cl::anychar_p - '\\');
             
                 short_string
                     =   cl::confix_p('\'', * string_char, '\'') |
@@ -564,7 +564,7 @@
                     =
                     *(  macro
                     |   escape          
-                    |   cl::repeat_p(1)[cl::anychar_p]  [plain_char]
+                    |   utf8_char_p                     [plain_char]
                     )
                     ;