$include_dir="/home/hyper-archives/boost-commit/include"; include("$include_dir/msg-header.inc") ?>
Subject: [Boost-commit] svn:boost r65943 - in trunk: boost/regex/v4 libs/regex/src libs/regex/test/regress
From: john_at_[hidden]
Date: 2010-10-13 12:53:15
Author: johnmaddock
Date: 2010-10-13 12:53:13 EDT (Wed, 13 Oct 2010)
New Revision: 65943
URL: http://svn.boost.org/trac/boost/changeset/65943
Log:
Fix code to handle multiple named-subexpressions with the same name.
Updated test cases to match.
Text files modified: 
   trunk/boost/regex/v4/basic_regex.hpp                |   102 ++++++++++----------------------------- 
   trunk/boost/regex/v4/basic_regex_creator.hpp        |     6 ++                                      
   trunk/boost/regex/v4/basic_regex_parser.hpp         |    32 +++++++++--                             
   trunk/boost/regex/v4/match_results.hpp              |    26 +++++++--                               
   trunk/boost/regex/v4/perl_matcher_common.hpp        |    61 ++++++++++++++++++++---                 
   trunk/libs/regex/src/regex_traits_defaults.cpp      |     2                                         
   trunk/libs/regex/test/regress/test_perl_ex.cpp      |    11 ++++                                    
   trunk/libs/regex/test/regress/test_replace.cpp      |     6 ++                                      
   trunk/libs/regex/test/regress/test_tricky_cases.cpp |     2                                         
   9 files changed, 149 insertions(+), 99 deletions(-)
Modified: trunk/boost/regex/v4/basic_regex.hpp
==============================================================================
--- trunk/boost/regex/v4/basic_regex.hpp	(original)
+++ trunk/boost/regex/v4/basic_regex.hpp	2010-10-13 12:53:13 EDT (Wed, 13 Oct 2010)
@@ -53,7 +53,7 @@
    if(first != last)
    {
       I next = last - 1;
-      while((next != first) && !(*(next-1) < *next))
+      while((next != first) && (*next < *(next-1)))
       {
          (next-1)->swap(*next);
          --next;
@@ -61,22 +61,6 @@
    }
 }
 
-//
-// Class named_subexpressions
-// Contains information about named subexpressions within the regex.
-//
-template <class charT>
-class named_subexpressions_base
-{
-public:
-   virtual int get_id(const charT* i, const charT* j)const = 0;
-   virtual int get_id(std::size_t h)const = 0;
-#ifdef __GNUC__
-   // warning supression:
-   virtual ~named_subexpressions_base(){}
-#endif
-};
-
 template <class Iterator>
 inline std::size_t hash_value_from_capture_name(Iterator i, Iterator j)
 {
@@ -86,13 +70,14 @@
    return r;
 }
 
-template <class charT>
-class named_subexpressions : public named_subexpressions_base<charT>
+class named_subexpressions
 {
+public:
    struct name
    {
+      template <class charT>
       name(const charT* i, const charT* j, int idx)
-         : /*n(i, j), */ index(idx) 
+         : index(idx) 
       { 
          hash = hash_value_from_capture_name(i, j); 
       }
@@ -100,31 +85,35 @@
          : index(idx), hash(h)
       { 
       }
-      //std::vector<charT> n;
       int index;
       std::size_t hash;
       bool operator < (const name& other)const
       {
-         return hash < other.hash; //std::lexicographical_compare(n.begin(), n.end(), other.n.begin(), other.n.end());
+         return hash < other.hash;
       }
       bool operator == (const name& other)const
       {
-         return hash == other.hash; //n == other.n;
+         return hash == other.hash; 
       }
       void swap(name& other)
       {
-         //n.swap(other.n);
          std::swap(index, other.index);
          std::swap(hash, other.hash);
       }
    };
-public:
+
+   typedef std::vector<name>::const_iterator const_iterator;
+   typedef std::pair<const_iterator, const_iterator> range_type;
+
    named_subexpressions(){}
+
+   template <class charT>
    void set_name(const charT* i, const charT* j, int index)
    {
       m_sub_names.push_back(name(i, j, index));
       bubble_down_one(m_sub_names.begin(), m_sub_names.end());
    }
+   template <class charT>
    int get_id(const charT* i, const charT* j)const
    {
       name t(i, j, 0);
@@ -135,72 +124,37 @@
       }
       return -1;
    }
+   template <class charT>
+   range_type equal_range(const charT* i, const charT* j)const
+   {
+      name t(i, j, 0);
+      return std::equal_range(m_sub_names.begin(), m_sub_names.end(), t);
+   }
    int get_id(std::size_t h)const
    {
       name t(h, 0);
-      typename std::vector<name>::const_iterator pos = std::lower_bound(m_sub_names.begin(), m_sub_names.end(), t);
+      std::vector<name>::const_iterator pos = std::lower_bound(m_sub_names.begin(), m_sub_names.end(), t);
       if((pos != m_sub_names.end()) && (*pos == t))
       {
          return pos->index;
       }
       return -1;
    }
-private:
-   std::vector<name> m_sub_names;
-};
-
-template <class charT, class Other>
-class named_subexpressions_converter : public named_subexpressions_base<charT>
-{
-   boost::shared_ptr<named_subexpressions<Other> > m_converter;
-public:
-   named_subexpressions_converter(boost::shared_ptr<named_subexpressions<Other> > s)
-      : m_converter(s) {}
-   int get_id(const charT* i, const charT* j)const
-   {
-      if(i == j)
-         return -1;
-      std::vector<Other> v;
-      while(i != j)
-      {
-         v.push_back(*i);
-         ++i;
-      }
-      return m_converter->get_id(&v[0], &v[0] + v.size());
-   }
-   int get_id(std::size_t h)const
+   range_type equal_range(std::size_t h)const
    {
-      return m_converter->get_id(h);
+      name t(h, 0);
+      return std::equal_range(m_sub_names.begin(), m_sub_names.end(), t);
    }
+private:
+   std::vector<name> m_sub_names;
 };
 
-template <class To>
-inline boost::shared_ptr<named_subexpressions_base<To> > convert_to_named_subs_imp(
-   boost::shared_ptr<named_subexpressions<To> > s, 
-   boost::integral_constant<bool,true> const&)
-{
-   return s;
-}
-template <class To, class From>
-inline boost::shared_ptr<named_subexpressions_base<To> > convert_to_named_subs_imp(
-   boost::shared_ptr<named_subexpressions<From> > s, 
-   boost::integral_constant<bool,false> const&)
-{
-   return boost::shared_ptr<named_subexpressions_converter<To, From> >(new named_subexpressions_converter<To, From>(s));
-}
-template <class To, class From>
-inline boost::shared_ptr<named_subexpressions_base<To> > convert_to_named_subs(
-   boost::shared_ptr<named_subexpressions<From> > s)
-{
-   typedef typename boost::is_same<To, From>::type tag_type;
-   return convert_to_named_subs_imp<To>(s, tag_type());
-}
 //
 // class regex_data:
 // represents the data we wish to expose to the matching algorithms.
 //
 template <class charT, class traits>
-struct regex_data : public named_subexpressions<charT>
+struct regex_data : public named_subexpressions
 {
    typedef regex_constants::syntax_option_type   flag_type;
    typedef std::size_t                           size_type;  
@@ -672,7 +626,7 @@
       BOOST_ASSERT(0 != m_pimpl.get());
       return m_pimpl->get_data();
    }
-   boost::shared_ptr<re_detail::named_subexpressions<charT> > get_named_subs()const
+   boost::shared_ptr<re_detail::named_subexpressions > get_named_subs()const
    {
       return m_pimpl;
    }
Modified: trunk/boost/regex/v4/basic_regex_creator.hpp
==============================================================================
--- trunk/boost/regex/v4/basic_regex_creator.hpp	(original)
+++ trunk/boost/regex/v4/basic_regex_creator.hpp	2010-10-13 12:53:13 EDT (Wed, 13 Oct 2010)
@@ -806,7 +806,13 @@
             re_syntax_base* p = base;
             std::ptrdiff_t idx = static_cast<re_jump*>(state)->alt.i;
             if(idx > 10000)
+            {
+               //
+               // There may be more than one capture group with this hash, just do what Perl
+               // does and recurse to the leftmost:
+               //
                idx = m_pdata->get_id(idx);
+            }
             while(p)
             {
                if((p->type == syntax_element_startmark) && (static_cast<re_brace*>(p)->index == idx))
Modified: trunk/boost/regex/v4/basic_regex_parser.hpp
==============================================================================
--- trunk/boost/regex/v4/basic_regex_parser.hpp	(original)
+++ trunk/boost/regex/v4/basic_regex_parser.hpp	2010-10-13 12:53:13 EDT (Wed, 13 Oct 2010)
@@ -820,7 +820,11 @@
             return false;
          }
          // maybe have \g{ddd}
-         if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_open_brace)
+         regex_constants::syntax_type syn = this->m_traits.syntax_type(*m_position);
+         regex_constants::syntax_type syn_end = 0;
+         if((syn == regex_constants::syntax_open_brace) 
+            || (syn == regex_constants::escape_type_left_word)
+            || (syn == regex_constants::escape_type_end_buffer))
          {
             if(++m_position == m_end)
             {
@@ -828,6 +832,18 @@
                return false;
             }
             have_brace = true;
+            switch(syn)
+            {
+            case regex_constants::syntax_open_brace:
+               syn_end = regex_constants::syntax_close_brace;
+               break;
+            case regex_constants::escape_type_left_word:
+               syn_end = regex_constants::escape_type_right_word;
+               break;
+            default:
+               syn_end = regex_constants::escape_type_end_buffer;
+               break;
+            }
          }
          negative = (*m_position == static_cast<charT>('-'));
          if((negative) && (++m_position == m_end))
@@ -837,18 +853,20 @@
          }
          const charT* pc = m_position;
          int i = this->m_traits.toi(pc, m_end, 10);
-         if(i < 0)
+         if((i < 0) && syn_end)
          {
-            // Check for a named capture:
+            // Check for a named capture, get the leftmost one if there is more than one:
             const charT* base = m_position;
-            while((m_position != m_end) && (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_brace))
+            while((m_position != m_end) && (this->m_traits.syntax_type(*m_position) != syn_end))
+            {
                ++m_position;
-            i = this->m_pdata->get_id(base, m_position);
+            }
+            i = hash_value_from_capture_name(base, m_position);
             pc = m_position;
          }
          if(negative)
             i = 1 + m_mark_count - i;
-         if((i > 0) && (this->m_backrefs & (1u << (i-1))))
+         if(((i > 0) && (this->m_backrefs & (1u << (i-1)))) || ((i > 10000) && (this->m_pdata->get_id(i) > 0) && (this->m_backrefs & (1u << (this->m_pdata->get_id(i)-1)))))
          {
             m_position = pc;
             re_brace* pb = static_cast<re_brace*>(this->append_state(syntax_element_backref, sizeof(re_brace)));
@@ -863,7 +881,7 @@
          m_position = pc;
          if(have_brace)
          {
-            if((m_position == m_end) || (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_brace))
+            if((m_position == m_end) || (this->m_traits.syntax_type(*m_position) != syn_end))
             {
                fail(regex_constants::error_escape, m_position - m_base, incomplete_message);
                return false;
Modified: trunk/boost/regex/v4/match_results.hpp
==============================================================================
--- trunk/boost/regex/v4/match_results.hpp	(original)
+++ trunk/boost/regex/v4/match_results.hpp	2010-10-13 12:53:13 EDT (Wed, 13 Oct 2010)
@@ -38,7 +38,6 @@
 
 namespace re_detail{
 
-template <class charT>
 class named_subexpressions;
 
 }
@@ -69,7 +68,7 @@
    typedef typename re_detail::regex_iterator_traits<
                                     BidiIterator>::value_type               char_type;
    typedef          std::basic_string<char_type>                            string_type;
-   typedef          re_detail::named_subexpressions_base<char_type>         named_sub_type;
+   typedef          re_detail::named_subexpressions                         named_sub_type;
 
    // construct/copy/destroy:
    explicit match_results(const Allocator& a = Allocator())
@@ -225,10 +224,15 @@
    //
    const_reference named_subexpression(const char_type* i, const char_type* j) const
    {
+      //
+      // Scan for the leftmost *matched* subexpression with the specified named:
+      //
       if(m_is_singular)
          raise_logic_error();
-      int index = m_named_subs->get_id(i, j);
-      return index > 0 ? (*this)[index] : m_null;
+      re_detail::named_subexpressions::range_type r = m_named_subs->equal_range(i, j);
+      while((r.first != r.second) && ((*this)[r.first->index].matched == false))
+         ++r.first;
+      return r.first != r.second ? (*this)[r.first->index] : m_null;
    }
    template <class charT>
    const_reference named_subexpression(const charT* i, const charT* j) const
@@ -243,10 +247,20 @@
    }
    int named_subexpression_index(const char_type* i, const char_type* j) const
    {
+      //
+      // Scan for the leftmost *matched* subexpression with the specified named.
+      // If none found then return the leftmost expression with that name,
+      // otherwise an invalid index:
+      //
       if(m_is_singular)
          raise_logic_error();
-      int index = m_named_subs->get_id(i, j);
-      return index > 0 ? index : -20;
+      re_detail::named_subexpressions::range_type s, r;
+      s = r = m_named_subs->equal_range(i, j);
+      while((r.first != r.second) && ((*this)[r.first->index].matched == false))
+         ++r.first;
+      if(r.first == r.second)
+         r = s;
+      return r.first != r.second ? r.first->index : -20;
    }
    template <class charT>
    int named_subexpression_index(const charT* i, const charT* j) const
Modified: trunk/boost/regex/v4/perl_matcher_common.hpp
==============================================================================
--- trunk/boost/regex/v4/perl_matcher_common.hpp	(original)
+++ trunk/boost/regex/v4/perl_matcher_common.hpp	2010-10-13 12:53:13 EDT (Wed, 13 Oct 2010)
@@ -200,7 +200,7 @@
    m_match_flags |= regex_constants::match_all;
    m_presult->set_size((m_match_flags & match_nosubs) ? 1 : re.mark_count(), search_base, last);
    m_presult->set_base(base);
-   m_presult->set_named_subs(re_detail::convert_to_named_subs<typename match_results<BidiIterator>::char_type>(this->re.get_named_subs()));
+   m_presult->set_named_subs(this->re.get_named_subs());
    if(m_match_flags & match_posix)
       m_result = *m_presult;
    verify_options(re.flags(), m_match_flags);
@@ -262,7 +262,7 @@
       pstate = re.get_first_state();
       m_presult->set_size((m_match_flags & match_nosubs) ? 1 : re.mark_count(), base, last);
       m_presult->set_base(base);
-      m_presult->set_named_subs(re_detail::convert_to_named_subs<typename match_results<BidiIterator>::char_type>(this->re.get_named_subs()));
+      m_presult->set_named_subs(this->re.get_named_subs());
       m_match_flags |= regex_constants::match_init;
    }
    else
@@ -588,8 +588,23 @@
    // in the match, this is in line with ECMAScript, but not Perl
    // or PCRE.
    //
-   BidiIterator i = (*m_presult)[static_cast<const re_brace*>(pstate)->index].first;
-   BidiIterator j = (*m_presult)[static_cast<const re_brace*>(pstate)->index].second;
+   int index = static_cast<const re_brace*>(pstate)->index;
+   if(index >= 10000)
+   {
+      named_subexpressions::range_type r = re.get_data().equal_range(index);
+      BOOST_ASSERT(r.first != r.second);
+      do
+      {
+         index = r.first->index;
+         ++r.first;
+      }while((r.first != r.second) && ((*m_presult)[index].matched != true));
+   }
+
+   if((m_match_flags & match_perl) && !(*m_presult)[index].matched)
+      return false;
+
+   BidiIterator i = (*m_presult)[index].first;
+   BidiIterator j = (*m_presult)[index].second;
    while(i != j)
    {
       if((position == last) || (traits_inst.translate(*position, icase) != traits_inst.translate(*i, icase)))
@@ -713,7 +728,7 @@
 {
    // return true if marked sub-expression N has been matched:
    int index = static_cast<const re_brace*>(pstate)->index;
-   bool result;
+   bool result = false;
    if(index == 9999)
    {
       // Magic value for a (DEFINE) block:
@@ -721,11 +736,25 @@
    }
    else if(index > 0)
    {
+      // Have we matched subexpression "index"?
       // Check if index is a hash value:
       if(index >= 10000)
-         index = re.get_data().get_id(index);
-      // Have we matched subexpression "index"?
-      result = (*m_presult)[index].matched;
+      {
+         named_subexpressions::range_type r = re.get_data().equal_range(index);
+         while(r.first != r.second)
+         {
+            if((*m_presult)[r.first->index].matched)
+            {
+               result = true;
+               break;
+            }
+            ++r.first;
+         }
+      }
+      else
+      {
+         result = (*m_presult)[index].matched;
+      }
       pstate = pstate->next.p;
    }
    else
@@ -734,8 +763,20 @@
       // If index == 0 then check for any recursion at all, otherwise for recursion to -index-1.
       int idx = -index-1;
       if(idx >= 10000)
-         idx = re.get_data().get_id(idx);
-      result = !recursion_stack.empty() && ((recursion_stack.back().idx == idx) || (index == 0));
+      {
+         named_subexpressions::range_type r = re.get_data().equal_range(idx);
+         int stack_index = recursion_stack.empty() ? -1 : recursion_stack.back().idx;
+         while(r.first != r.second)
+         {
+            result |= (stack_index == r.first->index);
+            if(result)break;
+            ++r.first;
+         }
+      }
+      else
+      {
+         result = !recursion_stack.empty() && ((recursion_stack.back().idx == idx) || (index == 0));
+      }
       pstate = pstate->next.p;
    }
    return result;
Modified: trunk/libs/regex/src/regex_traits_defaults.cpp
==============================================================================
--- trunk/libs/regex/src/regex_traits_defaults.cpp	(original)
+++ trunk/libs/regex/src/regex_traits_defaults.cpp	2010-10-13 12:53:13 EDT (Wed, 13 Oct 2010)
@@ -100,7 +100,7 @@
          "p",
          "P",
          "N",
-         "g",
+         "gk",
          "K",
          "R",
    };
Modified: trunk/libs/regex/test/regress/test_perl_ex.cpp
==============================================================================
--- trunk/libs/regex/test/regress/test_perl_ex.cpp	(original)
+++ trunk/libs/regex/test/regress/test_perl_ex.cpp	2010-10-13 12:53:13 EDT (Wed, 13 Oct 2010)
@@ -907,5 +907,16 @@
    // Bugs:
    TEST_REGEX_SEARCH("namespace\\s+(\\w+)\\s+(\\{(?:[^{}]*(?:(?2)[^{}]*)*)?\\})", perl, "namespace one { namespace two { int foo(); } }", match_default, make_array(0, 46, 10, 13, 14, 46, -2, -2));
    TEST_REGEX_SEARCH("namespace\\s+(\\w+)\\s+(\\{(?:[^{}]*(?:(?2)[^{}]*)*)?\\})", perl, "namespace one { namespace two { int foo(){} } { {{{ }  } } } {}}", match_default, make_array(0, 64, 10, 13, 14, 64, -2, -2));
+
+   // Recursion to a named sub with a name that is used multiple times:
+   TEST_REGEX_SEARCH("(?:(?<A>a+)|(?<A>b+))\\.(?&A)", perl, "aaaa.aa", match_default, make_array(0, 7, 0, 4, -1, -1, -2, -2));
+   TEST_REGEX_SEARCH("(?:(?<A>a+)|(?<A>b+))\\.(?&A)", perl, "bbbb.aa", match_default, make_array(0, 7, -1, -1, 0, 4, -2, -2));
+   TEST_REGEX_SEARCH("(?:(?<A>a+)|(?<A>b+))\\.(?&A)", perl, "bbbb.bb", match_default, make_array(-2, -2));
+   // Back reference to a named sub with a name that is used multiple times:
+   TEST_REGEX_SEARCH("(?:(?<A>a+)|(?<A>b+))\\.\\k<A>", perl, "aaaa.aaaa", match_default, make_array(0, 9, 0, 4, -1, -1, -2, -2));
+   TEST_REGEX_SEARCH("(?:(?<A>a+)|(?<A>b+))\\.\\k<A>", perl, "bbbb.aaaa", match_default, make_array(-2, -2));
+   TEST_REGEX_SEARCH("(?:(?<A>a+)|(?<A>b+))\\.\\k<A>", perl, "aaaa.bbbb", match_default, make_array(-2, -2));
+   TEST_REGEX_SEARCH("(?:(?<A>a+)|(?<A>b+))\\.\\k<A>", perl, "bbbb.bbbb", match_default, make_array(0, 9, -1, -1, 0, 4, -2, -2));
+   TEST_REGEX_SEARCH("(?:(?<A>a+)|(?<A>b+)|c+)\\.\\k<A>", perl, "cccc.cccc", match_default, make_array(-2, -2));
 }
 
Modified: trunk/libs/regex/test/regress/test_replace.cpp
==============================================================================
--- trunk/libs/regex/test/regress/test_replace.cpp	(original)
+++ trunk/libs/regex/test/regress/test_replace.cpp	2010-10-13 12:53:13 EDT (Wed, 13 Oct 2010)
@@ -184,5 +184,11 @@
    TEST_REGEX_REPLACE("(?<one>a+)(?<two>b+)", perl, " ...aabb,,", match_default|format_no_copy, "$2", "bb");
    TEST_REGEX_REPLACE("(?<one>a+)(?<two>b+)", perl, " ...aabb,,", match_default|format_no_copy, "d$+{one}c", "daac");
    TEST_REGEX_REPLACE("(?<one>a+)(?<two>b+)", perl, " ...aabb,,", match_default|format_no_copy, "c$+{two}d", "cbbd");
+
+   TEST_REGEX_REPLACE("(?<one>a)(?<one>b)(c)", perl, " ...abc,,", match_default, "$1.$2.$3.$+{one}", " ...a.b.c.a,,");
+   TEST_REGEX_REPLACE("(?:(?<one>a)|(?<one>b))", perl, " ...a,,", match_default, "$1.$2.$+{one}", " ...a..a,,");
+   TEST_REGEX_REPLACE("(?:(?<one>a)|(?<one>b))", perl, " ...b,,", match_default, "$1.$2.$+{one}", " ....b.b,,");
+   TEST_REGEX_REPLACE("(?:(?<one>a)(?<one>b))", perl, " ...ab,,", match_default, "$1.$2.$+{one}", " ...a.b.a,,");
+
 }
 
Modified: trunk/libs/regex/test/regress/test_tricky_cases.cpp
==============================================================================
--- trunk/libs/regex/test/regress/test_tricky_cases.cpp	(original)
+++ trunk/libs/regex/test/regress/test_tricky_cases.cpp	2010-10-13 12:53:13 EDT (Wed, 13 Oct 2010)
@@ -49,7 +49,7 @@
    TEST_REGEX_SEARCH("(a)d|(b)c", perl, "abc", match_default, make_array(1, 3, -1, -1, 1, 2, -2, -2));
    TEST_REGEX_SEARCH("_+((www)|(ftp)|(mailto)):_*", perl, "_wwwnocolon _mailto:", match_default, make_array(12, 20, 13, 19, -1, -1, -1, -1, 13, 19, -2, -2));
    // subtleties of matching
-   TEST_REGEX_SEARCH("a(b)?c\\1d", perl, "acd", match_default, make_array(0, 3, -1, -1, -2, -2));
+   TEST_REGEX_SEARCH("a(b)?c\\1d", perl, "acd", match_default, make_array(-2, -2));
    TEST_REGEX_SEARCH("a(b?c)+d", perl, "accd", match_default, make_array(0, 4, 2, 3, -2, -2));
    TEST_REGEX_SEARCH("(wee|week)(knights|night)", perl, "weeknights", match_default, make_array(0, 10, 0, 3, 3, 10, -2, -2));
    TEST_REGEX_SEARCH(".*", perl, "abc", match_default, make_array(0, 3, -2, 3, 3, -2, -2));