urbit/outside/re2/ucs2.diff

This is a dump from Google's source control system of the change
that removed UCS-2 support from RE2.  As the explanation below
says, UCS-2 mode is fundamentally at odds with things like ^ and $,
so it never really worked very well.  But if you are interested in using
it without those operators, it did work for that.  It assumed that the
UCS-2 data was in the native host byte order.

If you are interested in adding UCS-2 mode back, this patch might
be a good starting point.


Change 12780686 by rsc@rsc-re2 on 2009/09/16 15:30:15

	Retire UCS-2 mode.
	
	I added it as an experiment for V8, but it
	requires 2-byte lookahead to do completely,
	and RE2 has 1-byte lookahead (enough for UTF-8)
	as a fairly deep fundamental assumption,
	so it did not support ^ or $.

==== re2/bitstate.cc#2 - re2/bitstate.cc#3 ====
re2/bitstate.cc#2:314,321 - re2/bitstate.cc#3:314,319
      cap_[0] = p;
      if (TrySearch(prog_->start(), p))  // Match must be leftmost; done.
        return true;
-     if (prog_->flags() & Regexp::UCS2)
-       p++;
    }
    return false;
  }
==== re2/compile.cc#17 - re2/compile.cc#18 ====
re2/compile.cc#17:95,101 - re2/compile.cc#18:95,100
  // Input encodings.
  enum Encoding {
    kEncodingUTF8 = 1,  // UTF-8 (0-10FFFF)
-   kEncodingUCS2,     // UCS-2 (0-FFFF), native byte order
    kEncodingLatin1,    // Latin1 (0-FF)
  };
  
re2/compile.cc#17:168,176 - re2/compile.cc#18:167,172
    void AddRuneRangeLatin1(Rune lo, Rune hi, bool foldcase);
    void AddRuneRangeUTF8(Rune lo, Rune hi, bool foldcase);
    void Add_80_10ffff();
-   void AddRuneRangeUCS2(Rune lo, Rune hi, bool foldcase);
-   void AddUCS2Pair(uint8 lo1, uint8 hi1, bool fold1,
-                    uint8 lo2, uint8 hi2, bool fold2);
  
    // New suffix that matches the byte range lo-hi, then goes to next.
    Inst* RuneByteSuffix(uint8 lo, uint8 hi, bool foldcase, Inst* next);
re2/compile.cc#17:475,481 - re2/compile.cc#18:471,477
  
  // Converts rune range lo-hi into a fragment that recognizes
  // the bytes that would make up those runes in the current
- // encoding (Latin 1, UTF-8, or UCS-2).
+ // encoding (Latin 1 or UTF-8).
  // This lets the machine work byte-by-byte even when
  // using multibyte encodings.
  
re2/compile.cc#17:488,496 - re2/compile.cc#18:484,489
      case kEncodingLatin1:
        AddRuneRangeLatin1(lo, hi, foldcase);
        break;
-     case kEncodingUCS2:
-       AddRuneRangeUCS2(lo, hi, foldcase);
-       break;
    }
  }
  
re2/compile.cc#17:503,581 - re2/compile.cc#18:496,501
    AddSuffix(RuneByteSuffix(lo, hi, foldcase, NULL));
  }
  
- // Test whether 16-bit values are big or little endian.
- static bool BigEndian() {
-   union {
-     char byte[2];
-     int16 endian;
-   } u;
- 
-   u.byte[0] = 1;
-   u.byte[1] = 2;
-   return u.endian == 0x0102;
- }
- 
- void Compiler::AddUCS2Pair(uint8 lo1, uint8 hi1, bool fold1,
-                            uint8 lo2, uint8 hi2, bool fold2) {
-   Inst* ip;
-   if (reversed_) {
-     ip = RuneByteSuffix(lo1, hi1, fold1, NULL);
-     ip = RuneByteSuffix(lo2, hi2, fold2, ip);
-   } else {
-     ip = RuneByteSuffix(lo2, hi2, fold2, NULL);
-     ip = RuneByteSuffix(lo1, hi1, fold1, ip);
-   }
-   AddSuffix(ip);
- }
- 
- void Compiler::AddRuneRangeUCS2(Rune lo, Rune hi, bool foldcase) {
-   if (lo > hi || lo > 0xFFFF)
-     return;
-   if (hi > 0xFFFF)
-     hi = 0xFFFF;
- 
-   // We'll assemble a pattern assuming big endian.
-   // If the machine isn't, tell Cat to reverse its arguments.
-   bool oldreversed = reversed_;
-   if (!BigEndian()) {
-     reversed_ = !oldreversed;
-   }
- 
-   // Split into bytes.
-   int lo1 = lo >> 8;
-   int lo2 = lo & 0xFF;
-   int hi1 = hi >> 8;
-   int hi2 = hi & 0xFF;
- 
-   if (lo1 == hi1) {
-     // Easy case: high bits are same in both.
-     // Only do ASCII case folding on the second byte if the top byte is 00.
-     AddUCS2Pair(lo1, lo1, false, lo2, hi2, lo1==0 && foldcase);
-   } else {
-     // Harder case: different second byte ranges depending on first byte.
- 
-     // Initial fragment.
-     if (lo2 > 0) {
-       AddUCS2Pair(lo1, lo1, false, lo2, 0xFF, lo1==0 && foldcase);
-       lo1++;
-     }
- 
-     // Trailing fragment.
-     if (hi2 < 0xFF) {
-       AddUCS2Pair(hi1, hi1, false, 0, hi2, false);
-       hi1--;
-     }
- 
-     // Inner ranges.
-     if (lo1 <= hi1) {
-       AddUCS2Pair(lo1, hi1, false, 0, 0xFF, false);
-     }
-   }
- 
-   // Restore reverse setting.
-   reversed_ = oldreversed;
- }
- 
  // Table describing how to make a UTF-8 matching machine
  // for the rune range 80-10FFFF (Runeself-Runemax).
  // This range happens frequently enough (for example /./ and /[^a-z]/)
re2/compile.cc#17:707,716 - re2/compile.cc#18:627,634
  
  Frag Compiler::Literal(Rune r, bool foldcase) {
    switch (encoding_) {
-     default:  // UCS-2 or something new
-       BeginRange();
-       AddRuneRange(r, r, foldcase);
-       return EndRange();
+     default:
+       return kNullFrag;
  
      case kEncodingLatin1:
        return ByteRange(r, r, foldcase);
re2/compile.cc#17:927,934 - re2/compile.cc#18:845,850
  
    if (re->parse_flags() & Regexp::Latin1)
      c.encoding_ = kEncodingLatin1;
-   else if (re->parse_flags() & Regexp::UCS2)
-     c.encoding_ = kEncodingUCS2;
    c.reversed_ = reversed;
    if (max_mem <= 0) {
      c.max_inst_ = 100000;  // more than enough
re2/compile.cc#17:983,993 - re2/compile.cc#18:899,905
      c.prog_->set_start_unanchored(c.prog_->start());
    } else {
      Frag dot;
-     if (c.encoding_ == kEncodingUCS2) {
-       dot = c.Cat(c.ByteRange(0x00, 0xFF, false), c.ByteRange(0x00, 0xFF, false));
-     } else {
-       dot = c.ByteRange(0x00, 0xFF, false);
-     }
+     dot = c.ByteRange(0x00, 0xFF, false);
      Frag dotloop = c.Star(dot, true);
      Frag unanchored = c.Cat(dotloop, all);
      c.prog_->set_start_unanchored(unanchored.begin);
==== re2/nfa.cc#8 - re2/nfa.cc#9 ====
re2/nfa.cc#8:426,432 - re2/nfa.cc#9:426,431
    const char* bp = context.begin();
    int c = -1;
    int wasword = 0;
-   bool ucs2 = prog_->flags() & Regexp::UCS2;
  
    if (text.begin() > context.begin()) {
      c = text.begin()[-1] & 0xFF;
re2/nfa.cc#8:492,498 - re2/nfa.cc#9:491,497
        // If there's a required first byte for an unanchored search
        // and we're not in the middle of any possible matches,
        // use memchr to search for the byte quickly.
-       if (!ucs2 && !anchored && first_byte_ >= 0 && runq->size() == 0 &&
+       if (!anchored && first_byte_ >= 0 && runq->size() == 0 &&
            p < text.end() && (p[0] & 0xFF) != first_byte_) {
          p = reinterpret_cast<const char*>(memchr(p, first_byte_,
                                                   text.end() - p));
re2/nfa.cc#8:505,526 - re2/nfa.cc#9:504,514
          flag = Prog::EmptyFlags(context, p);
        }
  
-       // In UCS-2 mode, if we need to start a new thread,
-       // make sure to do it on an even boundary.
-       if(ucs2 && runq->size() == 0 &&
-           (p - context.begin()) % 2 && p < text.end()) {
-         p++;
-         flag = Prog::EmptyFlags(context, p);
-       }
- 
        // Steal match storage (cleared but unused as of yet)
        // temporarily to hold match boundaries for new thread.
-       // In UCS-2 mode, only start the thread on a 2-byte boundary.
-       if(!ucs2 || (p - context.begin()) % 2 == 0) {
-         match_[0] = p;
-         AddToThreadq(runq, start_, flag, p, match_);
-         match_[0] = NULL;
-       }
+       match_[0] = p;
+       AddToThreadq(runq, start_, flag, p, match_);
+       match_[0] = NULL;
      }
  
      // If all the threads have died, stop early.
==== re2/parse.cc#22 - re2/parse.cc#23 ====
re2/parse.cc#22:160,167 - re2/parse.cc#23:160,165
      status_(status), stacktop_(NULL), ncap_(0) {
    if (flags_ & Latin1)
      rune_max_ = 0xFF;
-   else if (flags & UCS2)
-     rune_max_ = 0xFFFF;
    else
      rune_max_ = Runemax;
  }
re2/parse.cc#22:365,387 - re2/parse.cc#23:363,374
  bool Regexp::ParseState::PushCarat() {
    if (flags_ & OneLine) {
      return PushSimpleOp(kRegexpBeginText);
-   } else {
-     if (flags_ & UCS2) {
-       status_->set_code(kRegexpUnsupported);
-       status_->set_error_arg("multiline ^ in UCS-2 mode");
-       return false;
-     }
-     return PushSimpleOp(kRegexpBeginLine);
    }
+   return PushSimpleOp(kRegexpBeginLine);
  }
  
  // Pushes a \b or \B onto the stack.
  bool Regexp::ParseState::PushWordBoundary(bool word) {
-   if (flags_ & UCS2) {
-     status_->set_code(kRegexpUnsupported);
-     status_->set_error_arg("\\b or \\B in UCS-2 mode");
-     return false;
-   }
    if (word)
      return PushSimpleOp(kRegexpWordBoundary);
    return PushSimpleOp(kRegexpNoWordBoundary);
re2/parse.cc#22:397,407 - re2/parse.cc#23:384,389
      bool ret = PushSimpleOp(kRegexpEndText);
      flags_ = oflags;
      return ret;
-   }
-   if (flags_ & UCS2) {
-     status_->set_code(kRegexpUnsupported);
-     status_->set_error_arg("multiline $ in UCS-2 mode");
-     return false;
    }
    return PushSimpleOp(kRegexpEndLine);
  }
==== re2/re2.cc#34 - re2/re2.cc#35 ====
re2/re2.cc#34:79,86 - re2/re2.cc#35:79,84
        return RE2::ErrorBadUTF8;
      case re2::kRegexpBadNamedCapture:
        return RE2::ErrorBadNamedCapture;
-     case re2::kRegexpUnsupported:
-       return RE2::ErrorUnsupported;
    }
    return RE2::ErrorInternal;
  }
re2/re2.cc#34:122,130 - re2/re2.cc#35:120,125
        break;
      case RE2::Options::EncodingLatin1:
        flags |= Regexp::Latin1;
-       break;
-     case RE2::Options::EncodingUCS2:
-       flags |= Regexp::UCS2;
        break;
    }
  
==== re2/re2.h#36 - re2/re2.h#37 ====
re2/re2.h#36:246,252 - re2/re2.h#37:246,251
      ErrorBadUTF8,            // invalid UTF-8 in regexp
      ErrorBadNamedCapture,    // bad named capture group
      ErrorPatternTooLarge,    // pattern too large (compile failed)
-     ErrorUnsupported,        // unsupported feature (in UCS-2 mode)
    };
  
    // Predefined common options.
re2/re2.h#36:570,576 - re2/re2.h#37:569,574
  
      enum Encoding {
        EncodingUTF8 = 1,
-       EncodingUCS2,      // 16-bit Unicode 0-FFFF only
        EncodingLatin1
      };
  
==== re2/regexp.cc#15 - re2/regexp.cc#16 ====
re2/regexp.cc#15:324,333 - re2/regexp.cc#16:324,329
  // the regexp that remains after the prefix.  The prefix might
  // be ASCII case-insensitive.
  bool Regexp::RequiredPrefix(string *prefix, bool *foldcase, Regexp** suffix) {
-   // Don't even bother for UCS-2; it's time to throw that code away.
-   if (parse_flags_ & UCS2)
-     return false;
- 
    // No need for a walker: the regexp must be of the form
    // 1. some number of ^ anchors
    // 2. a literal char or string
==== re2/regexp.h#20 - re2/regexp.h#21 ====
re2/regexp.h#20:187,193 - re2/regexp.h#21:187,192
    kRegexpBadPerlOp,          // bad perl operator
    kRegexpBadUTF8,            // invalid UTF-8 in regexp
    kRegexpBadNamedCapture,    // bad named capture
-   kRegexpUnsupported,        // unsupported operator
  };
  
  // Error status for certain operations.
re2/regexp.h#20:307,316 - re2/regexp.h#21:306,314
                             //   \Q and \E to disable/enable metacharacters
                             //   (?P<name>expr) for named captures
                             //   \C to match any single byte
-     UCS2         = 1<<10,  // Text is in UCS-2, regexp is in UTF-8.
-     UnicodeGroups = 1<<11, // Allow \p{Han} for Unicode Han group
+     UnicodeGroups = 1<<10, // Allow \p{Han} for Unicode Han group
                             //   and \P{Han} for its negation.
-     NeverNL      = 1<<12,  // Never match NL, even if the regexp mentions
+     NeverNL      = 1<<11,  // Never match NL, even if the regexp mentions
                             //   it explicitly.
  
      // As close to Perl as we can get.
==== re2/testing/backtrack.cc#4 - re2/testing/backtrack.cc#5 ====
re2/testing/backtrack.cc#4:134,141 - re2/testing/backtrack.cc#5:134,139
      cap_[0] = p;
      if (Visit(prog_->start(), p))  // Match must be leftmost; done.
        return true;
-     if (prog_->flags() & Regexp::UCS2)
-       p++;
    }
    return false;
  }
==== re2/testing/tester.cc#12 - re2/testing/tester.cc#13 ====
re2/testing/tester.cc#12:144,154 - re2/testing/tester.cc#13:144,152
  static ParseMode parse_modes[] = {
    { single_line,                   "single-line"          },
    { single_line|Regexp::Latin1,    "single-line, latin1"  },
-   { single_line|Regexp::UCS2,     "single-line, ucs2"   },
    { multi_line,                    "multiline"            },
    { multi_line|Regexp::NonGreedy,  "multiline, nongreedy" },
    { multi_line|Regexp::Latin1,     "multiline, latin1"    },
-   { multi_line|Regexp::UCS2,      "multiline, ucs2"     },
  };
  
  static string FormatMode(Regexp::ParseFlags flags) {
re2/testing/tester.cc#12:179,189 - re2/testing/tester.cc#13:177,185
    RegexpStatus status;
    regexp_ = Regexp::Parse(regexp_str, flags, &status);
    if (regexp_ == NULL) {
-     if (status.code() != kRegexpUnsupported) {
-       LOG(INFO) << "Cannot parse: " << CEscape(regexp_str_)
-                 << " mode: " << FormatMode(flags);
-       error_ = true;
-     }
+     LOG(INFO) << "Cannot parse: " << CEscape(regexp_str_)
+               << " mode: " << FormatMode(flags);
+     error_ = true;
      return;
    }
    prog_ = regexp_->CompileToProg(0);
re2/testing/tester.cc#12:230,237 - re2/testing/tester.cc#13:226,231
      RE2::Options options;
      if (flags & Regexp::Latin1)
        options.set_encoding(RE2::Options::EncodingLatin1);
-     else if (flags & Regexp::UCS2)
-       options.set_encoding(RE2::Options::EncodingUCS2);
      if (kind_ == Prog::kLongestMatch)
        options.set_longest_match(true);
      re2_ = new RE2(re, options);
re2/testing/tester.cc#12:281,379 - re2/testing/tester.cc#13:275,280
      delete re2_;
  }
  
- // Converts UTF-8 string in text into UCS-2 string in new_text.
- static bool ConvertUTF8ToUCS2(const StringPiece& text, StringPiece* new_text) {
-   const char* p = text.begin();
-   const char* ep = text.end();
-   uint16* q = new uint16[ep - p];
-   uint16* q0 = q;
- 
-   int n;
-   Rune r;
-   for (; p < ep; p += n) {
-     if (!fullrune(p, ep - p)) {
-       delete[] q0;
-       return false;
-     }
-     n = chartorune(&r, p);
-     if (r > 0xFFFF) {
-       delete[] q0;
-       return false;
-     }
-     *q++ = r;
-   }
-   *new_text = StringPiece(reinterpret_cast<char*>(q0), 2*(q - q0));
-   return true;
- }
- 
- // Rewrites *sp from being a pointer into text8 (UTF-8)
- // to being a pointer into text16 (equivalent text but in UCS-2).
- static void AdjustUTF8ToUCS2(const StringPiece& text8, const StringPiece& text16,
-                               StringPiece *sp) {
-   if (sp->begin() == NULL && text8.begin() != NULL)
-     return;
- 
-   int nrune = 0;
-   int n;
-   Rune r;
-   const char* p = text8.begin();
-   const char* ep = text8.end();
-   const char* spbegin = NULL;
-   const char* spend = NULL;
-   for (;;) {
-     if (p == sp->begin())
-       spbegin = text16.begin() + sizeof(uint16)*nrune;
-     if (p == sp->end())
-       spend = text16.begin() + sizeof(uint16)*nrune;
-     if (p >= ep)
-       break;
-     n = chartorune(&r, p);
-     p += n;
-     nrune++;
-   }
-   if (spbegin == NULL || spend == NULL) {
-     LOG(FATAL) << "Error in AdjustUTF8ToUCS2 "
-                << CEscape(text8) << " "
-                << (int)(sp->begin() - text8.begin()) << " "
-                << (int)(sp->end() - text8.begin());
-   }
-   *sp = StringPiece(spbegin, spend - spbegin);
- }
- 
- // Rewrites *sp from begin a pointer into text16 (UCS-2)
- // to being a pointer into text8 (equivalent text but in UTF-8).
- static void AdjustUCS2ToUTF8(const StringPiece& text16, const StringPiece& text8,
-                               StringPiece* sp) {
-   if (sp->begin() == NULL)
-     return;
- 
-   int nrune = 0;
-   int n;
-   Rune r;
-   const char* p = text8.begin();
-   const char* ep = text8.end();
-   const char* spbegin = NULL;
-   const char* spend = NULL;
-   for (;;) {
-     if (nrune == (sp->begin() - text16.begin())/2)
-       spbegin = p;
-     if (nrune == (sp->end() - text16.begin())/2)
-       spend = p;
-     if (p >= ep)
-       break;
-     n = chartorune(&r, p);
-     p += n;
-     nrune++;
-   }
-   if (text8.begin() != NULL && (spbegin == NULL || spend == NULL)) {
-     LOG(FATAL) << "Error in AdjustUCS2ToUTF8 "
-                << CEscape(text16) << " "
-                << (int)(sp->begin() - text16.begin()) << " "
-                << (int)(sp->end() - text16.begin());
-   }
-   *sp = StringPiece(spbegin, spend - spbegin);
- }
- 
  // Runs a single search using the named engine type.
  // This interface hides all the irregularities of the various
  // engine interfaces from the rest of this file.
re2/testing/tester.cc#12:393,411 - re2/testing/tester.cc#13:294,300
  
    StringPiece text = orig_text;
    StringPiece context = orig_context;
-   bool ucs2 = false;
  
-   if ((flags() & Regexp::UCS2) && type != kEnginePCRE) {
-     if (!ConvertUTF8ToUCS2(orig_context, &context)) {
-       result->skipped = true;
-       return;
-     }
- 
-     // Rewrite context to refer to new text.
-     AdjustUTF8ToUCS2(orig_context, context, &text);
-     ucs2 = true;
-   }
- 
    switch (type) {
      default:
        LOG(FATAL) << "Bad RunSearch type: " << (int)type;
re2/testing/tester.cc#12:557,577 - re2/testing/tester.cc#13:446,451
      }
    }
  
-   // If we did UCS-2 matching, rewrite the matches to refer
-   // to the original UTF-8 text.
-   if (ucs2) {
-     if (result->matched) {
-       if (result->have_submatch0) {
-         AdjustUCS2ToUTF8(context, orig_context, &result->submatch[0]);
-       } else if (result->have_submatch) {
-         for (int i = 0; i < nsubmatch; i++) {
-           AdjustUCS2ToUTF8(context, orig_context, &result->submatch[i]);
-         }
-       }
-     }
-     delete[] context.begin();
-   }
- 
    if (!result->matched)
      memset(result->submatch, 0, sizeof result->submatch);
  }
re2/testing/tester.cc#12:596,617 - re2/testing/tester.cc#13:470,475
    return true;
  }
  
- // Check whether text uses only Unicode points <= 0xFFFF
- // (in the BMP).
- static bool IsBMP(const StringPiece& text) {
-   const char* p = text.begin();
-   const char* ep = text.end();
-   while (p < ep) {
-     if (!fullrune(p, ep - p))
-       return false;
-     Rune r;
-     p += chartorune(&r, p);
-     if (r > 0xFFFF)
-       return false;
-   }
-   return true;
- }
- 
  // Runs a single test.
  bool TestInstance::RunCase(const StringPiece& text, const StringPiece& context,
                             Prog::Anchor anchor) {
re2/testing/tester.cc#12:619,625 - re2/testing/tester.cc#13:477,483
    Result correct;
    RunSearch(kEngineBacktrack, text, context, anchor, &correct);
    if (correct.skipped) {
-     if (regexp_ == NULL || !IsBMP(context))  // okay to skip in UCS-2 mode
+     if (regexp_ == NULL)
        return true;
      LOG(ERROR) << "Skipped backtracking! " << CEscape(regexp_str_)
                 << " " << FormatMode(flags_);
Add 'outside/re2/' from commit '539b44fc4c5a49c3453b80e3af85d297f4cab4bf' git-subtree-dir: outside/re2 git-subtree-mainline: f94738bfd171ae447133e0964843addbb497894f git-subtree-split: 539b44fc4c5a49c3453b80e3af85d297f4cab4bf 2014-04-10 22:36:47 +04:00			`This is a dump from Google's source control system of the change`
			`that removed UCS-2 support from RE2. As the explanation below`
			`says, UCS-2 mode is fundamentally at odds with things like ^ and $,`
			`so it never really worked very well. But if you are interested in using`
			`it without those operators, it did work for that. It assumed that the`
			`UCS-2 data was in the native host byte order.`

			`If you are interested in adding UCS-2 mode back, this patch might`
			`be a good starting point.`


			`Change 12780686 by rsc@rsc-re2 on 2009/09/16 15:30:15`

			`Retire UCS-2 mode.`

			`I added it as an experiment for V8, but it`
			`requires 2-byte lookahead to do completely,`
			`and RE2 has 1-byte lookahead (enough for UTF-8)`
			`as a fairly deep fundamental assumption,`
			`so it did not support ^ or $.`

			`==== re2/bitstate.cc#2 - re2/bitstate.cc#3 ====`
			`re2/bitstate.cc#2:314,321 - re2/bitstate.cc#3:314,319`
			`cap_[0] = p;`
			`if (TrySearch(prog_->start(), p)) // Match must be leftmost; done.`
			`return true;`
			`- if (prog_->flags() & Regexp::UCS2)`
			`- p++;`
			`}`
			`return false;`
			`}`
			`==== re2/compile.cc#17 - re2/compile.cc#18 ====`
			`re2/compile.cc#17:95,101 - re2/compile.cc#18:95,100`
			`// Input encodings.`
			`enum Encoding {`
			`kEncodingUTF8 = 1, // UTF-8 (0-10FFFF)`
			`- kEncodingUCS2, // UCS-2 (0-FFFF), native byte order`
			`kEncodingLatin1, // Latin1 (0-FF)`
			`};`

			`re2/compile.cc#17:168,176 - re2/compile.cc#18:167,172`
			`void AddRuneRangeLatin1(Rune lo, Rune hi, bool foldcase);`
			`void AddRuneRangeUTF8(Rune lo, Rune hi, bool foldcase);`
			`void Add_80_10ffff();`
			`- void AddRuneRangeUCS2(Rune lo, Rune hi, bool foldcase);`
			`- void AddUCS2Pair(uint8 lo1, uint8 hi1, bool fold1,`
			`- uint8 lo2, uint8 hi2, bool fold2);`

			`// New suffix that matches the byte range lo-hi, then goes to next.`
			`Inst* RuneByteSuffix(uint8 lo, uint8 hi, bool foldcase, Inst* next);`
			`re2/compile.cc#17:475,481 - re2/compile.cc#18:471,477`

			`// Converts rune range lo-hi into a fragment that recognizes`
			`// the bytes that would make up those runes in the current`
			`- // encoding (Latin 1, UTF-8, or UCS-2).`
			`+ // encoding (Latin 1 or UTF-8).`
			`// This lets the machine work byte-by-byte even when`
			`// using multibyte encodings.`

			`re2/compile.cc#17:488,496 - re2/compile.cc#18:484,489`
			`case kEncodingLatin1:`
			`AddRuneRangeLatin1(lo, hi, foldcase);`
			`break;`
			`- case kEncodingUCS2:`
			`- AddRuneRangeUCS2(lo, hi, foldcase);`
			`- break;`
			`}`
			`}`

			`re2/compile.cc#17:503,581 - re2/compile.cc#18:496,501`
			`AddSuffix(RuneByteSuffix(lo, hi, foldcase, NULL));`
			`}`

			`- // Test whether 16-bit values are big or little endian.`
			`- static bool BigEndian() {`
			`- union {`
			`- char byte[2];`
			`- int16 endian;`
			`- } u;`
			`-`
			`- u.byte[0] = 1;`
			`- u.byte[1] = 2;`
			`- return u.endian == 0x0102;`
			`- }`
			`-`
			`- void Compiler::AddUCS2Pair(uint8 lo1, uint8 hi1, bool fold1,`
			`- uint8 lo2, uint8 hi2, bool fold2) {`
			`- Inst* ip;`
			`- if (reversed_) {`
			`- ip = RuneByteSuffix(lo1, hi1, fold1, NULL);`
			`- ip = RuneByteSuffix(lo2, hi2, fold2, ip);`
			`- } else {`
			`- ip = RuneByteSuffix(lo2, hi2, fold2, NULL);`
			`- ip = RuneByteSuffix(lo1, hi1, fold1, ip);`
			`- }`
			`- AddSuffix(ip);`
			`- }`
			`-`
			`- void Compiler::AddRuneRangeUCS2(Rune lo, Rune hi, bool foldcase) {`
			`- if (lo > hi || lo > 0xFFFF)`
			`- return;`
			`- if (hi > 0xFFFF)`
			`- hi = 0xFFFF;`
			`-`
			`- // We'll assemble a pattern assuming big endian.`
			`- // If the machine isn't, tell Cat to reverse its arguments.`
			`- bool oldreversed = reversed_;`
			`- if (!BigEndian()) {`
			`- reversed_ = !oldreversed;`
			`- }`
			`-`
			`- // Split into bytes.`
			`- int lo1 = lo >> 8;`
			`- int lo2 = lo & 0xFF;`
			`- int hi1 = hi >> 8;`
			`- int hi2 = hi & 0xFF;`
			`-`
			`- if (lo1 == hi1) {`
			`- // Easy case: high bits are same in both.`
			`- // Only do ASCII case folding on the second byte if the top byte is 00.`
			`- AddUCS2Pair(lo1, lo1, false, lo2, hi2, lo1==0 && foldcase);`
			`- } else {`
			`- // Harder case: different second byte ranges depending on first byte.`
			`-`
			`- // Initial fragment.`
			`- if (lo2 > 0) {`
			`- AddUCS2Pair(lo1, lo1, false, lo2, 0xFF, lo1==0 && foldcase);`
			`- lo1++;`
			`- }`
			`-`
			`- // Trailing fragment.`
			`- if (hi2 < 0xFF) {`
			`- AddUCS2Pair(hi1, hi1, false, 0, hi2, false);`
			`- hi1--;`
			`- }`
			`-`
			`- // Inner ranges.`
			`- if (lo1 <= hi1) {`
			`- AddUCS2Pair(lo1, hi1, false, 0, 0xFF, false);`
			`- }`
			`- }`
			`-`
			`- // Restore reverse setting.`
			`- reversed_ = oldreversed;`
			`- }`
			`-`
			`// Table describing how to make a UTF-8 matching machine`
			`// for the rune range 80-10FFFF (Runeself-Runemax).`
			`// This range happens frequently enough (for example /./ and /[^a-z]/)`
			`re2/compile.cc#17:707,716 - re2/compile.cc#18:627,634`

			`Frag Compiler::Literal(Rune r, bool foldcase) {`
			`switch (encoding_) {`
			`- default: // UCS-2 or something new`
			`- BeginRange();`
			`- AddRuneRange(r, r, foldcase);`
			`- return EndRange();`
			`+ default:`
			`+ return kNullFrag;`

			`case kEncodingLatin1:`
			`return ByteRange(r, r, foldcase);`
			`re2/compile.cc#17:927,934 - re2/compile.cc#18:845,850`

			`if (re->parse_flags() & Regexp::Latin1)`
			`c.encoding_ = kEncodingLatin1;`
			`- else if (re->parse_flags() & Regexp::UCS2)`
			`- c.encoding_ = kEncodingUCS2;`
			`c.reversed_ = reversed;`
			`if (max_mem <= 0) {`
			`c.max_inst_ = 100000; // more than enough`
			`re2/compile.cc#17:983,993 - re2/compile.cc#18:899,905`
			`c.prog_->set_start_unanchored(c.prog_->start());`
			`} else {`
			`Frag dot;`
			`- if (c.encoding_ == kEncodingUCS2) {`
			`- dot = c.Cat(c.ByteRange(0x00, 0xFF, false), c.ByteRange(0x00, 0xFF, false));`
			`- } else {`
			`- dot = c.ByteRange(0x00, 0xFF, false);`
			`- }`
			`+ dot = c.ByteRange(0x00, 0xFF, false);`
			`Frag dotloop = c.Star(dot, true);`
			`Frag unanchored = c.Cat(dotloop, all);`
			`c.prog_->set_start_unanchored(unanchored.begin);`
			`==== re2/nfa.cc#8 - re2/nfa.cc#9 ====`
			`re2/nfa.cc#8:426,432 - re2/nfa.cc#9:426,431`
			`const char* bp = context.begin();`
			`int c = -1;`
			`int wasword = 0;`
			`- bool ucs2 = prog_->flags() & Regexp::UCS2;`

			`if (text.begin() > context.begin()) {`
			`c = text.begin()[-1] & 0xFF;`
			`re2/nfa.cc#8:492,498 - re2/nfa.cc#9:491,497`
			`// If there's a required first byte for an unanchored search`
			`// and we're not in the middle of any possible matches,`
			`// use memchr to search for the byte quickly.`
			`- if (!ucs2 && !anchored && first_byte_ >= 0 && runq->size() == 0 &&`
			`+ if (!anchored && first_byte_ >= 0 && runq->size() == 0 &&`
			`p < text.end() && (p[0] & 0xFF) != first_byte_) {`
			`p = reinterpret_cast<const char*>(memchr(p, first_byte_,`
			`text.end() - p));`
			`re2/nfa.cc#8:505,526 - re2/nfa.cc#9:504,514`
			`flag = Prog::EmptyFlags(context, p);`
			`}`

			`- // In UCS-2 mode, if we need to start a new thread,`
			`- // make sure to do it on an even boundary.`
			`- if(ucs2 && runq->size() == 0 &&`
			`- (p - context.begin()) % 2 && p < text.end()) {`
			`- p++;`
			`- flag = Prog::EmptyFlags(context, p);`
			`- }`
			`-`
			`// Steal match storage (cleared but unused as of yet)`
			`// temporarily to hold match boundaries for new thread.`
			`- // In UCS-2 mode, only start the thread on a 2-byte boundary.`
			`- if(!ucs2 || (p - context.begin()) % 2 == 0) {`
			`- match_[0] = p;`
			`- AddToThreadq(runq, start_, flag, p, match_);`
			`- match_[0] = NULL;`
			`- }`
			`+ match_[0] = p;`
			`+ AddToThreadq(runq, start_, flag, p, match_);`
			`+ match_[0] = NULL;`
			`}`

			`// If all the threads have died, stop early.`
			`==== re2/parse.cc#22 - re2/parse.cc#23 ====`
			`re2/parse.cc#22:160,167 - re2/parse.cc#23:160,165`
			`status_(status), stacktop_(NULL), ncap_(0) {`
			`if (flags_ & Latin1)`
			`rune_max_ = 0xFF;`
			`- else if (flags & UCS2)`
			`- rune_max_ = 0xFFFF;`
			`else`
			`rune_max_ = Runemax;`
			`}`
			`re2/parse.cc#22:365,387 - re2/parse.cc#23:363,374`
			`bool Regexp::ParseState::PushCarat() {`
			`if (flags_ & OneLine) {`
			`return PushSimpleOp(kRegexpBeginText);`
			`- } else {`
			`- if (flags_ & UCS2) {`
			`- status_->set_code(kRegexpUnsupported);`
			`- status_->set_error_arg("multiline ^ in UCS-2 mode");`
			`- return false;`
			`- }`
			`- return PushSimpleOp(kRegexpBeginLine);`
			`}`
			`+ return PushSimpleOp(kRegexpBeginLine);`
			`}`

			`// Pushes a \b or \B onto the stack.`
			`bool Regexp::ParseState::PushWordBoundary(bool word) {`
			`- if (flags_ & UCS2) {`
			`- status_->set_code(kRegexpUnsupported);`
			`- status_->set_error_arg("\\b or \\B in UCS-2 mode");`
			`- return false;`
			`- }`
			`if (word)`
			`return PushSimpleOp(kRegexpWordBoundary);`
			`return PushSimpleOp(kRegexpNoWordBoundary);`
			`re2/parse.cc#22:397,407 - re2/parse.cc#23:384,389`
			`bool ret = PushSimpleOp(kRegexpEndText);`
			`flags_ = oflags;`
			`return ret;`
			`- }`
			`- if (flags_ & UCS2) {`
			`- status_->set_code(kRegexpUnsupported);`
			`- status_->set_error_arg("multiline $ in UCS-2 mode");`
			`- return false;`
			`}`
			`return PushSimpleOp(kRegexpEndLine);`
			`}`
			`==== re2/re2.cc#34 - re2/re2.cc#35 ====`
			`re2/re2.cc#34:79,86 - re2/re2.cc#35:79,84`
			`return RE2::ErrorBadUTF8;`
			`case re2::kRegexpBadNamedCapture:`
			`return RE2::ErrorBadNamedCapture;`
			`- case re2::kRegexpUnsupported:`
			`- return RE2::ErrorUnsupported;`
			`}`
			`return RE2::ErrorInternal;`
			`}`
			`re2/re2.cc#34:122,130 - re2/re2.cc#35:120,125`
			`break;`
			`case RE2::Options::EncodingLatin1:`
			`flags |= Regexp::Latin1;`
			`- break;`
			`- case RE2::Options::EncodingUCS2:`
			`- flags |= Regexp::UCS2;`
			`break;`
			`}`

			`==== re2/re2.h#36 - re2/re2.h#37 ====`
			`re2/re2.h#36:246,252 - re2/re2.h#37:246,251`
			`ErrorBadUTF8, // invalid UTF-8 in regexp`
			`ErrorBadNamedCapture, // bad named capture group`
			`ErrorPatternTooLarge, // pattern too large (compile failed)`
			`- ErrorUnsupported, // unsupported feature (in UCS-2 mode)`
			`};`

			`// Predefined common options.`
			`re2/re2.h#36:570,576 - re2/re2.h#37:569,574`

			`enum Encoding {`
			`EncodingUTF8 = 1,`
			`- EncodingUCS2, // 16-bit Unicode 0-FFFF only`
			`EncodingLatin1`
			`};`

			`==== re2/regexp.cc#15 - re2/regexp.cc#16 ====`
			`re2/regexp.cc#15:324,333 - re2/regexp.cc#16:324,329`
			`// the regexp that remains after the prefix. The prefix might`
			`// be ASCII case-insensitive.`
			`bool Regexp::RequiredPrefix(string *prefix, bool *foldcase, Regexp** suffix) {`
			`- // Don't even bother for UCS-2; it's time to throw that code away.`
			`- if (parse_flags_ & UCS2)`
			`- return false;`
			`-`
			`// No need for a walker: the regexp must be of the form`
			`// 1. some number of ^ anchors`
			`// 2. a literal char or string`
			`==== re2/regexp.h#20 - re2/regexp.h#21 ====`
			`re2/regexp.h#20:187,193 - re2/regexp.h#21:187,192`
			`kRegexpBadPerlOp, // bad perl operator`
			`kRegexpBadUTF8, // invalid UTF-8 in regexp`
			`kRegexpBadNamedCapture, // bad named capture`
			`- kRegexpUnsupported, // unsupported operator`
			`};`

			`// Error status for certain operations.`
			`re2/regexp.h#20:307,316 - re2/regexp.h#21:306,314`
			`// \Q and \E to disable/enable metacharacters`
			`// (?P<name>expr) for named captures`
			`// \C to match any single byte`
			`- UCS2 = 1<<10, // Text is in UCS-2, regexp is in UTF-8.`
			`- UnicodeGroups = 1<<11, // Allow \p{Han} for Unicode Han group`
			`+ UnicodeGroups = 1<<10, // Allow \p{Han} for Unicode Han group`
			`// and \P{Han} for its negation.`
			`- NeverNL = 1<<12, // Never match NL, even if the regexp mentions`
			`+ NeverNL = 1<<11, // Never match NL, even if the regexp mentions`
			`// it explicitly.`

			`// As close to Perl as we can get.`
			`==== re2/testing/backtrack.cc#4 - re2/testing/backtrack.cc#5 ====`
			`re2/testing/backtrack.cc#4:134,141 - re2/testing/backtrack.cc#5:134,139`
			`cap_[0] = p;`
			`if (Visit(prog_->start(), p)) // Match must be leftmost; done.`
			`return true;`
			`- if (prog_->flags() & Regexp::UCS2)`
			`- p++;`
			`}`
			`return false;`
			`}`
			`==== re2/testing/tester.cc#12 - re2/testing/tester.cc#13 ====`
			`re2/testing/tester.cc#12:144,154 - re2/testing/tester.cc#13:144,152`
			`static ParseMode parse_modes[] = {`
			`{ single_line, "single-line" },`
			`{ single_line\|Regexp::Latin1, "single-line, latin1" },`
			`- { single_line\|Regexp::UCS2, "single-line, ucs2" },`
			`{ multi_line, "multiline" },`
			`{ multi_line\|Regexp::NonGreedy, "multiline, nongreedy" },`
			`{ multi_line\|Regexp::Latin1, "multiline, latin1" },`
			`- { multi_line\|Regexp::UCS2, "multiline, ucs2" },`
			`};`

			`static string FormatMode(Regexp::ParseFlags flags) {`
			`re2/testing/tester.cc#12:179,189 - re2/testing/tester.cc#13:177,185`
			`RegexpStatus status;`
			`regexp_ = Regexp::Parse(regexp_str, flags, &status);`
			`if (regexp_ == NULL) {`
			`- if (status.code() != kRegexpUnsupported) {`
			`- LOG(INFO) << "Cannot parse: " << CEscape(regexp_str_)`
			`- << " mode: " << FormatMode(flags);`
			`- error_ = true;`
			`- }`
			`+ LOG(INFO) << "Cannot parse: " << CEscape(regexp_str_)`
			`+ << " mode: " << FormatMode(flags);`
			`+ error_ = true;`
			`return;`
			`}`
			`prog_ = regexp_->CompileToProg(0);`
			`re2/testing/tester.cc#12:230,237 - re2/testing/tester.cc#13:226,231`
			`RE2::Options options;`
			`if (flags & Regexp::Latin1)`
			`options.set_encoding(RE2::Options::EncodingLatin1);`
			`- else if (flags & Regexp::UCS2)`
			`- options.set_encoding(RE2::Options::EncodingUCS2);`
			`if (kind_ == Prog::kLongestMatch)`
			`options.set_longest_match(true);`
			`re2_ = new RE2(re, options);`
			`re2/testing/tester.cc#12:281,379 - re2/testing/tester.cc#13:275,280`
			`delete re2_;`
			`}`

			`- // Converts UTF-8 string in text into UCS-2 string in new_text.`
			`- static bool ConvertUTF8ToUCS2(const StringPiece& text, StringPiece* new_text) {`
			`- const char* p = text.begin();`
			`- const char* ep = text.end();`
			`- uint16* q = new uint16[ep - p];`
			`- uint16* q0 = q;`
			`-`
			`- int n;`
			`- Rune r;`
			`- for (; p < ep; p += n) {`
			`- if (!fullrune(p, ep - p)) {`
			`- delete[] q0;`
			`- return false;`
			`- }`
			`- n = chartorune(&r, p);`
			`- if (r > 0xFFFF) {`
			`- delete[] q0;`
			`- return false;`
			`- }`
			`- *q++ = r;`
			`- }`
			`- *new_text = StringPiece(reinterpret_cast<char*>(q0), 2*(q - q0));`
			`- return true;`
			`- }`
			`-`
			`- // Rewrites *sp from being a pointer into text8 (UTF-8)`
			`- // to being a pointer into text16 (equivalent text but in UCS-2).`
			`- static void AdjustUTF8ToUCS2(const StringPiece& text8, const StringPiece& text16,`
			`- StringPiece *sp) {`
			`- if (sp->begin() == NULL && text8.begin() != NULL)`
			`- return;`
			`-`
			`- int nrune = 0;`
			`- int n;`
			`- Rune r;`
			`- const char* p = text8.begin();`
			`- const char* ep = text8.end();`
			`- const char* spbegin = NULL;`
			`- const char* spend = NULL;`
			`- for (;;) {`
			`- if (p == sp->begin())`
			`- spbegin = text16.begin() + sizeof(uint16)*nrune;`
			`- if (p == sp->end())`
			`- spend = text16.begin() + sizeof(uint16)*nrune;`
			`- if (p >= ep)`
			`- break;`
			`- n = chartorune(&r, p);`
			`- p += n;`
			`- nrune++;`
			`- }`
			`- if (spbegin == NULL \|\| spend == NULL) {`
			`- LOG(FATAL) << "Error in AdjustUTF8ToUCS2 "`
			`- << CEscape(text8) << " "`
			`- << (int)(sp->begin() - text8.begin()) << " "`
			`- << (int)(sp->end() - text8.begin());`
			`- }`
			`- *sp = StringPiece(spbegin, spend - spbegin);`
			`- }`
			`-`
			`- // Rewrites *sp from begin a pointer into text16 (UCS-2)`
			`- // to being a pointer into text8 (equivalent text but in UTF-8).`
			`- static void AdjustUCS2ToUTF8(const StringPiece& text16, const StringPiece& text8,`
			`- StringPiece* sp) {`
			`- if (sp->begin() == NULL)`
			`- return;`
			`-`
			`- int nrune = 0;`
			`- int n;`
			`- Rune r;`
			`- const char* p = text8.begin();`
			`- const char* ep = text8.end();`
			`- const char* spbegin = NULL;`
			`- const char* spend = NULL;`
			`- for (;;) {`
			`- if (nrune == (sp->begin() - text16.begin())/2)`
			`- spbegin = p;`
			`- if (nrune == (sp->end() - text16.begin())/2)`
			`- spend = p;`
			`- if (p >= ep)`
			`- break;`
			`- n = chartorune(&r, p);`
			`- p += n;`
			`- nrune++;`
			`- }`
			`- if (text8.begin() != NULL && (spbegin == NULL \|\| spend == NULL)) {`
			`- LOG(FATAL) << "Error in AdjustUCS2ToUTF8 "`
			`- << CEscape(text16) << " "`
			`- << (int)(sp->begin() - text16.begin()) << " "`
			`- << (int)(sp->end() - text16.begin());`
			`- }`
			`- *sp = StringPiece(spbegin, spend - spbegin);`
			`- }`
			`-`
			`// Runs a single search using the named engine type.`
			`// This interface hides all the irregularities of the various`
			`// engine interfaces from the rest of this file.`
			`re2/testing/tester.cc#12:393,411 - re2/testing/tester.cc#13:294,300`

			`StringPiece text = orig_text;`
			`StringPiece context = orig_context;`
			`- bool ucs2 = false;`

			`- if ((flags() & Regexp::UCS2) && type != kEnginePCRE) {`
			`- if (!ConvertUTF8ToUCS2(orig_context, &context)) {`
			`- result->skipped = true;`
			`- return;`
			`- }`
			`-`
			`- // Rewrite context to refer to new text.`
			`- AdjustUTF8ToUCS2(orig_context, context, &text);`
			`- ucs2 = true;`
			`- }`
			`-`
			`switch (type) {`
			`default:`
			`LOG(FATAL) << "Bad RunSearch type: " << (int)type;`
			`re2/testing/tester.cc#12:557,577 - re2/testing/tester.cc#13:446,451`
			`}`
			`}`

			`- // If we did UCS-2 matching, rewrite the matches to refer`
			`- // to the original UTF-8 text.`
			`- if (ucs2) {`
			`- if (result->matched) {`
			`- if (result->have_submatch0) {`
			`- AdjustUCS2ToUTF8(context, orig_context, &result->submatch[0]);`
			`- } else if (result->have_submatch) {`
			`- for (int i = 0; i < nsubmatch; i++) {`
			`- AdjustUCS2ToUTF8(context, orig_context, &result->submatch[i]);`
			`- }`
			`- }`
			`- }`
			`- delete[] context.begin();`
			`- }`
			`-`
			`if (!result->matched)`
			`memset(result->submatch, 0, sizeof result->submatch);`
			`}`
			`re2/testing/tester.cc#12:596,617 - re2/testing/tester.cc#13:470,475`
			`return true;`
			`}`

			`- // Check whether text uses only Unicode points <= 0xFFFF`
			`- // (in the BMP).`
			`- static bool IsBMP(const StringPiece& text) {`
			`- const char* p = text.begin();`
			`- const char* ep = text.end();`
			`- while (p < ep) {`
			`- if (!fullrune(p, ep - p))`
			`- return false;`
			`- Rune r;`
			`- p += chartorune(&r, p);`
			`- if (r > 0xFFFF)`
			`- return false;`
			`- }`
			`- return true;`
			`- }`
			`-`
			`// Runs a single test.`
			`bool TestInstance::RunCase(const StringPiece& text, const StringPiece& context,`
			`Prog::Anchor anchor) {`
			`re2/testing/tester.cc#12:619,625 - re2/testing/tester.cc#13:477,483`
			`Result correct;`
			`RunSearch(kEngineBacktrack, text, context, anchor, &correct);`
			`if (correct.skipped) {`
			`- if (regexp_ == NULL \|\| !IsBMP(context)) // okay to skip in UCS-2 mode`
			`+ if (regexp_ == NULL)`
			`return true;`
			`LOG(ERROR) << "Skipped backtracking! " << CEscape(regexp_str_)`
			`<< " " << FormatMode(flags_);`
No results found.