$include_dir="/home/hyper-archives/boost-commit/include"; include("$include_dir/msg-header.inc") ?>
Subject: [Boost-commit] svn:boost r59677 - in trunk/tools/quickbook: detail test
From: daniel_james_at_[hidden]
Date: 2010-02-14 08:09:53
Author: danieljames
Date: 2010-02-14 08:09:52 EST (Sun, 14 Feb 2010)
New Revision: 59677
URL: http://svn.boost.org/trac/boost/changeset/59677
Log:
Support UTF-8 BOM if present.
I'm not sure if the utf-16/32 checks are the right thing to do, but as
quickbook only supports utf-8 for now, they'll work okay. Hopefully if
it ever supports other encodings this should be offloaded to an
appropriate library.
Added:
   trunk/tools/quickbook/test/utf-16be-bom.quickbook   (contents, props changed)
   trunk/tools/quickbook/test/utf-16le-bom.quickbook   (contents, props changed)
   trunk/tools/quickbook/test/utf-8-bom.gold   (contents, props changed)
   trunk/tools/quickbook/test/utf-8-bom.quickbook   (contents, props changed)
   trunk/tools/quickbook/test/utf-8.gold   (contents, props changed)
   trunk/tools/quickbook/test/utf-8.quickbook   (contents, props changed)
Text files modified: 
   trunk/tools/quickbook/detail/utils.cpp |    75 +++++++++++++++++++++++++++++++++++++-- 
   trunk/tools/quickbook/test/Jamfile.v2  |     7 ++-                                     
   2 files changed, 75 insertions(+), 7 deletions(-)
Modified: trunk/tools/quickbook/detail/utils.cpp
==============================================================================
--- trunk/tools/quickbook/detail/utils.cpp	(original)
+++ trunk/tools/quickbook/detail/utils.cpp	2010-02-14 08:09:52 EST (Sun, 14 Feb 2010)
@@ -181,13 +181,74 @@
         }
     }
 
+    // Read the first few bytes in a file to see if it starts with a byte
+    // order mark. If it doesn't, then write the characters we've already
+    // read in. Although, given how UTF-8 works, if we've read anything in,
+    // the file's probably broken.
+
+    template <class InputIterator, class OutputIterator>
+    bool check_bom(InputIterator& begin, InputIterator end,
+            OutputIterator out, char const* chars, int length)
+    {
+        char const* ptr = chars;
+
+        while(begin != end && *begin == *ptr) {
+            ++begin;
+            ++ptr;
+            --length;
+            if(length == 0) return true;
+        }
+
+        // Failed to match, so write the skipped characters to storage:
+        while(chars != ptr) *out++ = *chars++;
+
+        return false;
+    }
+    
+    template <class InputIterator, class OutputIterator>
+    std::string read_bom(InputIterator& begin, InputIterator end,
+            OutputIterator out)
+    {
+        if(begin == end) return "";
+
+        const char utf8[] = {0xef, 0xbb, 0xbf};
+        const char utf32be[] = {0, 0, 0xfe, 0xff};
+        const char utf32le[] = {0xff, 0xfe, 0, 0};
+
+        unsigned char c = *begin;
+        switch(c)
+        {
+        case 0xEF: { // UTF-8
+            return check_bom(begin, end, out, utf8, 3) ? "UTF-8" : "";
+        }
+        case 0xFF: // UTF-16/UTF-32 little endian
+            return !check_bom(begin, end, out, utf32le, 2) ? "" :
+                check_bom(begin, end, out, utf32le + 2, 2) ? "UTF-32" : "UTF-16";
+        case 0: // UTF-32 big endian
+            return check_bom(begin, end, out, utf32be, 4) ? "UTF-32" : "";
+        case 0xFE: // UTF-16 big endian
+            return check_bom(begin, end, out, utf32be + 2, 2) ? "UTF-16" : "";
+        default:
+            return "";
+        }
+    }
+
     // Copy a string, converting mac and windows style newlines to unix
     // newlines.
 
     template <class InputIterator, class OutputIterator>
-    void normalize_newlines(InputIterator begin, InputIterator end,
-            OutputIterator out)
+    bool normalize(InputIterator begin, InputIterator end,
+            OutputIterator out, std::string const& filename)
     {
+        std::string encoding = read_bom(begin, end, out);
+
+        if(encoding != "UTF-8" && encoding != "") {
+            outerr(filename) << encoding << " is not supported. Please use UTF-8."
+                << std::endl;
+
+            return false;
+        }
+    
         while(begin != end) {
             if(*begin == '\r') {
                 *out++ = '\n';
@@ -198,6 +259,8 @@
                 *out++ = *begin++;
             }
         }
+        
+        return true;
     }
 
     int load(std::string const& filename, std::string& storage)
@@ -219,10 +282,14 @@
         // Turn off white space skipping on the stream
         in.unsetf(ios::skipws);
 
-        normalize_newlines(
+        if(!normalize(
             istream_iterator<char>(in),
             istream_iterator<char>(),
-            std::back_inserter(storage));
+            std::back_inserter(storage),
+            filename))
+        {
+            return 1;
+        }
 
         //  ensure that we have enough trailing newlines to eliminate
         //  the need to check for end of file in the grammar.
Modified: trunk/tools/quickbook/test/Jamfile.v2
==============================================================================
--- trunk/tools/quickbook/test/Jamfile.v2	(original)
+++ trunk/tools/quickbook/test/Jamfile.v2	2010-02-14 08:09:52 EST (Sun, 14 Feb 2010)
@@ -43,7 +43,8 @@
     [ quickbook-fail-test fail-parse-error1 ]
     [ quickbook-fail-test fail-parse-error2 ]
     [ quickbook-fail-test fail-template-lookup1 ]
+    [ quickbook-test utf-8 ]
+    [ quickbook-test utf-8-bom ]
+    [ quickbook-fail-test utf-16be-bom ]
+    [ quickbook-fail-test utf-16le-bom ]
     ;
-
-
-
Added: trunk/tools/quickbook/test/utf-16be-bom.quickbook
==============================================================================
Binary files (empty file) and trunk/tools/quickbook/test/utf-16be-bom.quickbook	2010-02-14 08:09:52 EST (Sun, 14 Feb 2010) differ
Added: trunk/tools/quickbook/test/utf-16le-bom.quickbook
==============================================================================
Binary files (empty file) and trunk/tools/quickbook/test/utf-16le-bom.quickbook	2010-02-14 08:09:52 EST (Sun, 14 Feb 2010) differ
Added: trunk/tools/quickbook/test/utf-8-bom.gold
==============================================================================
--- (empty file)
+++ trunk/tools/quickbook/test/utf-8-bom.gold	2010-02-14 08:09:52 EST (Sun, 14 Feb 2010)
@@ -0,0 +1,85 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE library PUBLIC "-//Boost//DTD BoostBook XML V1.0//EN" "http://www.boost.org/tools/boostbook/dtd/boostbook.dtd">
+<article id="utf_8_test" last-revision="DEBUG MODE Date: 2000/12/20 12:00:00 $" xmlns:xi="http://www.w3.org/2001/XInclude">
+  <title>UTF-8 test</title>
+  <articleinfo>
+  </articleinfo>
+  <anchor id="utf_8_test.i__t__rn__ti__n__liz__ti__n"/>
+  <bridgehead renderas="sect2">
+    <link linkend="utf_8_test.i__t__rn__ti__n__liz__ti__n">Iñtërnâtiônàlizætiøn</link>
+  </bridgehead>
+  <itemizedlist>
+    <listitem>
+      Αα Alpha
+    </listitem>
+    <listitem>
+      Ββ Beta
+    </listitem>
+    <listitem>
+      Γγ Gamma
+    </listitem>
+    <listitem>
+      Δδ Delta
+    </listitem>
+    <listitem>
+      Εε Epsilon
+    </listitem>
+    <listitem>
+      Ζζ Zeta
+    </listitem>
+    <listitem>
+      Ηη Eta
+    </listitem>
+    <listitem>
+      Θθ Theta
+    </listitem>
+    <listitem>
+      Ιι Iota
+    </listitem>
+    <listitem>
+      Κκ Kappa
+    </listitem>
+    <listitem>
+      Λλ Lambda
+    </listitem>
+    <listitem>
+      Μμ Mu
+    </listitem>
+    <listitem>
+      Νν Nu
+    </listitem>
+    <listitem>
+      Ξξ Xi
+    </listitem>
+    <listitem>
+      Οο Omicron
+    </listitem>
+    <listitem>
+      Ππ Pi
+    </listitem>
+    <listitem>
+      Ρρ Rho
+    </listitem>
+    <listitem>
+      Σσς Sigma
+    </listitem>
+    <listitem>
+      Ττ Tau
+    </listitem>
+    <listitem>
+      Υυ Upsilon
+    </listitem>
+    <listitem>
+      Φφ Phi
+    </listitem>
+    <listitem>
+      Χχ Chi
+    </listitem>
+    <listitem>
+      Ψψ Psi
+    </listitem>
+    <listitem>
+      Ωω Omega
+    </listitem>
+  </itemizedlist>
+</article>
Added: trunk/tools/quickbook/test/utf-8-bom.quickbook
==============================================================================
--- (empty file)
+++ trunk/tools/quickbook/test/utf-8-bom.quickbook	2010-02-14 08:09:52 EST (Sun, 14 Feb 2010)
@@ -0,0 +1,30 @@
+[article UTF-8 test
+    [quickbook 1.5]
+]
+
+[heading Iñtërnâtiônàlizætiøn]
+
+* Αα Alpha
+* Ββ Beta
+* Γγ Gamma
+* Δδ Delta
+* Εε Epsilon
+* Ζζ Zeta
+* Ηη Eta
+* Θθ Theta
+* Ιι Iota
+* Κκ Kappa
+* Λλ Lambda
+* Μμ Mu
+* Νν Nu
+* Ξξ Xi
+* Οο Omicron
+* Ππ Pi
+* Ρρ Rho
+* Σσς Sigma
+* Ττ Tau
+* Υυ Upsilon
+* Φφ Phi
+* Χχ Chi
+* Ψψ Psi
+* Ωω Omega
\ No newline at end of file
Added: trunk/tools/quickbook/test/utf-8.gold
==============================================================================
--- (empty file)
+++ trunk/tools/quickbook/test/utf-8.gold	2010-02-14 08:09:52 EST (Sun, 14 Feb 2010)
@@ -0,0 +1,85 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE library PUBLIC "-//Boost//DTD BoostBook XML V1.0//EN" "http://www.boost.org/tools/boostbook/dtd/boostbook.dtd">
+<article id="utf_8_test" last-revision="DEBUG MODE Date: 2000/12/20 12:00:00 $" xmlns:xi="http://www.w3.org/2001/XInclude">
+  <title>UTF-8 test</title>
+  <articleinfo>
+  </articleinfo>
+  <anchor id="utf_8_test.i__t__rn__ti__n__liz__ti__n"/>
+  <bridgehead renderas="sect2">
+    <link linkend="utf_8_test.i__t__rn__ti__n__liz__ti__n">Iñtërnâtiônàlizætiøn</link>
+  </bridgehead>
+  <itemizedlist>
+    <listitem>
+      Αα Alpha
+    </listitem>
+    <listitem>
+      Ββ Beta
+    </listitem>
+    <listitem>
+      Γγ Gamma
+    </listitem>
+    <listitem>
+      Δδ Delta
+    </listitem>
+    <listitem>
+      Εε Epsilon
+    </listitem>
+    <listitem>
+      Ζζ Zeta
+    </listitem>
+    <listitem>
+      Ηη Eta
+    </listitem>
+    <listitem>
+      Θθ Theta
+    </listitem>
+    <listitem>
+      Ιι Iota
+    </listitem>
+    <listitem>
+      Κκ Kappa
+    </listitem>
+    <listitem>
+      Λλ Lambda
+    </listitem>
+    <listitem>
+      Μμ Mu
+    </listitem>
+    <listitem>
+      Νν Nu
+    </listitem>
+    <listitem>
+      Ξξ Xi
+    </listitem>
+    <listitem>
+      Οο Omicron
+    </listitem>
+    <listitem>
+      Ππ Pi
+    </listitem>
+    <listitem>
+      Ρρ Rho
+    </listitem>
+    <listitem>
+      Σσς Sigma
+    </listitem>
+    <listitem>
+      Ττ Tau
+    </listitem>
+    <listitem>
+      Υυ Upsilon
+    </listitem>
+    <listitem>
+      Φφ Phi
+    </listitem>
+    <listitem>
+      Χχ Chi
+    </listitem>
+    <listitem>
+      Ψψ Psi
+    </listitem>
+    <listitem>
+      Ωω Omega
+    </listitem>
+  </itemizedlist>
+</article>
Added: trunk/tools/quickbook/test/utf-8.quickbook
==============================================================================
--- (empty file)
+++ trunk/tools/quickbook/test/utf-8.quickbook	2010-02-14 08:09:52 EST (Sun, 14 Feb 2010)
@@ -0,0 +1,30 @@
+[article UTF-8 test
+    [quickbook 1.5]
+]
+
+[heading Iñtërnâtiônàlizætiøn]
+
+* Αα Alpha
+* Ββ Beta
+* Γγ Gamma
+* Δδ Delta
+* Εε Epsilon
+* Ζζ Zeta
+* Ηη Eta
+* Θθ Theta
+* Ιι Iota
+* Κκ Kappa
+* Λλ Lambda
+* Μμ Mu
+* Νν Nu
+* Ξξ Xi
+* Οο Omicron
+* Ππ Pi
+* Ρρ Rho
+* Σσς Sigma
+* Ττ Tau
+* Υυ Upsilon
+* Φφ Phi
+* Χχ Chi
+* Ψψ Psi
+* Ωω Omega
\ No newline at end of file