Improve handling HTML special cases (#312)

- Prefer spreading markup over a full word. - Ignore certain tags that are unlikely to be meant for translation, such as `<code>` and `<samp>`. - Never treat `<wbr>` as a space. - Allow for inconsistent casing in tag names. - Fix bug where void elements were inserted multiple times. - Better handling of whitespace around punctuation. - Ignore parsing `<noscript>` to be compatible with Firefox. - Improvements to documentation and readability of `HTML` and `Scanner` classes. Fixes: #313, #339
2024-09-11 05:35:33 +03:00 · 2022-02-22 20:25:34 +00:00 · 2022-02-22 20:25:34 +00:00 · 1f98f971a5
commit 1f98f971a5
parent 9eb243725b
8 changed files with 530 additions and 170 deletions
--- a/2
+++ b/2
@ -1 +1 @@
-Subproject commit 3c0f95a1775a74f5db441aa2f17ceb7437679022
+Subproject commit 3776609ce5f7a238245e303efaa007b2d5078180
--- a/src/tests/units/html_tests.cpp
+++ b/src/tests/units/html_tests.cpp
@ -172,6 +172,16 @@ TEST_CASE("Do not abort if the input is just empty element") {
  CHECK(response.target.text == "<p></p>");
 }

+TEST_CASE("Tag names are case insensitive") {
+  // Tests <P> vs </p> and <BR> should be recognized as a void tag <br>.
+  // <B> should be recognized as inline.
+  std::string test_str("<P><B>Spa</B>ce<BR>please?</p>");
+
+  std::string input(test_str);
+  HTML html(std::move(input), true);
+  CHECK(input == "Spa ce\n\nplease?");
+}
+
 TEST_CASE("Test case html entities") {
  // These are all entities I would expect in innerHTML, since all other entities
  // can be encoded as UTF-8 so there's no need to encode them through &...; when
@ -618,6 +628,72 @@ TEST_CASE("Test comment") {
  CHECK(response.target.text == test_str);
 }

+TEST_CASE("Test <wbr> element") {
+  std::string test_str("hel<wbr>lo");
+
+  std::string input(test_str);
+  HTML html(std::move(input), true);
+  CHECK(input == "hello");
+}
+
+TEST_CASE("Test <wbr> element (case-insensitive)") {
+  std::string test_str("hel<WBR>lo");
+
+  std::string input(test_str);
+  HTML html(std::move(input), true);
+  CHECK(input == "hello");
+}
+
+TEST_CASE("Test ignored element (nested)") {
+  std::string test_str("foo <var><var>nested</var></var> bar");
+  std::string expected_str("foo  <var><var>nested</var></var>bar");
+
+  std::string input(test_str);
+  HTML html(std::move(input), true);
+  CHECK(input == "foo  bar");
+
+  Response response;
+  std::string sentence_str("foo  bar");
+  std::vector<string_view> sentence{
+      string_view(sentence_str.data() + 0, 3),  // foo
+      string_view(sentence_str.data() + 3, 1),  // _
+      string_view(sentence_str.data() + 4, 4),  // _bar
+      string_view(sentence_str.data() + 8, 0),  // ""
+  };
+  response.source.appendSentence("", sentence.begin(), sentence.end());
+  response.target.appendSentence("", sentence.begin(), sentence.end());
+  response.alignments = {identity_matrix<float>(4)};
+
+  html.restore(response);
+  CHECK(response.source.text == expected_str);
+  CHECK(response.target.text == expected_str);
+}
+
+TEST_CASE("Test ignored element (with entity)") {
+  std::string test_str("foo <var>&amp;</var> bar");
+  std::string expected_str("foo  <var>&amp;</var>bar");
+
+  std::string input(test_str);
+  HTML html(std::move(input), true);
+  CHECK(input == "foo  bar");
+
+  Response response;
+  std::string sentence_str("foo  bar");
+  std::vector<string_view> sentence{
+      string_view(sentence_str.data() + 0, 3),  // foo
+      string_view(sentence_str.data() + 3, 1),  // _
+      string_view(sentence_str.data() + 4, 4),  // _bar
+      string_view(sentence_str.data() + 8, 0),  // ""
+  };
+  response.source.appendSentence("", sentence.begin(), sentence.end());
+  response.target.appendSentence("", sentence.begin(), sentence.end());
+  response.alignments = {identity_matrix<float>(4)};
+
+  html.restore(response);
+  CHECK(response.source.text == expected_str);
+  CHECK(response.target.text == expected_str);
+}
+
 TEST_CASE("End-to-end translation", "[!mayfail]") {
  std::string input("<p>I <b>like</b> to <u>drive</u> this car.</p>");
  HTML html(std::move(input), true);
--- a/src/translator/annotation.h
+++ b/src/translator/annotation.h
@ -185,6 +185,41 @@ struct AnnotatedText {
  /// Returns a ByteRange representing sentence corresponding to sentenceIdx.
  ByteRange sentenceAsByteRange(size_t sentenceIdx) const { return annotation.sentence(sentenceIdx); }

+  /// Utility function to call `fun` on each word (subword token effectively) in
+  /// an `AnnotatedText`. `fun` is called with the `ByteRange`, the `string_view`
+  /// with the word, and a `bool` to indicate whether it is the last word in the
+  /// `AnnotatedText`, which is also the ending whitespace slot of AnnotatedText.
+  template <typename Fun>
+  AnnotatedText apply(Fun fun) const {
+    AnnotatedText out;
+
+    for (size_t sentenceIdx = 0; sentenceIdx < numSentences(); ++sentenceIdx) {
+      std::string sentence;
+      std::vector<ByteRange> tokens;
+
+      std::string prefix = fun(annotation.gap(sentenceIdx), gap(sentenceIdx), false);
+
+      for (size_t wordIdx = 0; wordIdx < numWords(sentenceIdx); ++wordIdx) {
+        std::string token = fun(wordAsByteRange(sentenceIdx, wordIdx), word(sentenceIdx, wordIdx), false);
+        tokens.push_back(ByteRange{sentence.size(), sentence.size() + token.size()});
+        sentence += token;
+      }
+
+      // Convert our ByteRanges to string_views since that's what appendSentence
+      // expects
+      std::vector<marian::string_view> views(tokens.size());
+      std::transform(tokens.begin(), tokens.end(), views.begin(), [&](ByteRange const &range) {
+        return marian::string_view(sentence.data() + range.begin, range.size());
+      });
+
+      out.appendSentence(prefix, views.begin(), views.end());
+    }
+
+    out.appendEndingWhitespace(fun(annotation.gap(numSentences()), gap(numSentences()), true));
+
+    return out;
+  }
+
 private:
  string_view asStringView(const ByteRange &byteRange) const {
    return string_view(text.data() + byteRange.begin, byteRange.size());
--- a/src/translator/html.cpp
+++ b/src/translator/html.cpp
@ -1,21 +1,23 @@
 #include "html.h"

+#include <algorithm>
+
 #include "response.h"
 #include "xh_scanner.h"

 namespace {
-using marian::string_view;
 using marian::bergamot::AnnotatedText;
 using marian::bergamot::ByteRange;
 using marian::bergamot::HTML;
 using marian::bergamot::Response;

-void encodeEntities(string_view const &input, std::string &output) {
+/// Encodes the minimum of HTML entities.
+void encodeEntities(marian::string_view const &input, std::string &output) {
  output.clear();
  output.reserve(input.size());  // assumes there are no entities in most cases

-  for (auto it = input.begin(); it != input.end(); ++it) {
-    switch (*it) {
+  for (char it : input) {
+    switch (it) {
      case '&':
        output.append("&amp;");
        break;
@ -35,19 +37,30 @@ void encodeEntities(string_view const &input, std::string &output) {
      //   output.append("&apos;");
      //   break;
      default:
-        output.push_back(*it);
+        output.push_back(it);
        break;
    }
  }
 }

-size_t countPrefixWhitespaces(string_view const &input) {
+/// Counts number of whitespace characters at the start of the input. Used
+/// for determining where to insert an open or close tag.
+size_t countPrefixWhitespaces(marian::string_view const &input) {
  size_t size = 0;
  while (size < input.size() && std::isspace(input[size])) ++size;
  return size;
 }

-// Very simple replacement for std::format introduced in C++20
+std::string toLowerCase(std::string_view const &input) {
+  std::string out;
+  out.resize(input.size());
+  std::transform(input.begin(), input.end(), out.begin(), [](unsigned char c) { return std::tolower(c); });
+  return out;
+}
+
+/// Very simple replacement for std::format introduced in C++20. Only supports
+/// replacing `{}` in the template string with whatever `operator<<` for that
+/// type turns it into.
 std::string format(std::string const &formatTemplate) { return formatTemplate; }

 template <typename Arg>
@ -68,14 +81,14 @@ std::string format(std::string const &formatTemplate, Arg arg, Args... args) {
  return os.str();
 }

-// Syntactic sugar around rbegin() and rend() that allows me to write
-// `for (auto &&item : reversed(container))` instead of the needlessly verbose
-// `for (auto it = container.rbegin(); it != container.rend(); ++it)`
+/// Syntactic sugar around rbegin() and rend() that allows me to write
+/// `for (auto &&item : reversed(container))` instead of the needlessly verbose
+/// `for (auto it = container.rbegin(); it != container.rend(); ++it)`
 template <typename T>
-class reversed {
+class Reversed {
 public:
-  typedef typename T::const_reverse_iterator iterator;
-  explicit reversed(T const &container) : container_(container){};
+  using iterator = typename T::const_reverse_iterator;
+  explicit Reversed(T const &container) : container_(container){};
  iterator begin() const { return container_.rbegin(); }
  iterator end() const { return container_.rend(); }

@ -83,11 +96,10 @@ class reversed {
  T const &container_;
 };

-bool contains(std::unordered_set<std::string> const &set, std::string const &name) {
-  return set.find(name) != set.end();
-}
-
-void diffTags(HTML::Taint const &prev, HTML::Taint const &curr, HTML::Taint &opening, HTML::Taint &closing) {
+/// When comparing two tag stacks, determine which tags need to be closed and
+/// opened to get from one stack to the other.
+void diffTags(HTML::TagStack const &prev, HTML::TagStack const &curr, HTML::TagStack &opening,
+              HTML::TagStack &closing) {
  opening.clear();
  closing.clear();

@ -98,9 +110,11 @@ void diffTags(HTML::Taint const &prev, HTML::Taint const &curr, HTML::Taint &ope
    if (i >= curr.size() || prev[i] != curr[i]) break;

  // Only nodes of type ELEMENT can have children and thus would need a closing tag.
+  // NOLINTNEXTLINE(bugprone-narrowing-conversions)
  std::copy_if(prev.begin() + i, prev.end(), std::back_inserter(closing),
               [&](HTML::Tag *tag) { return tag->type == HTML::Tag::ELEMENT; });

+  // NOLINTNEXTLINE(bugprone-narrowing-conversions)
  opening.insert(opening.end(), curr.begin() + i, curr.end());
 }

@ -108,42 +122,24 @@ bool intersects(ByteRange const &range, HTML::Span const &span) {
  return range.begin <= span.end && range.end >= span.begin;
 };

-bool containsTag(HTML::Taint const &stack, HTML::Tag const *tag) {
+bool contains(HTML::TagNameSet const &set, std::string_view const &name) { return set.find(name) != set.end(); }
+
+bool contains(HTML::TagStack const &stack, HTML::Tag const *tag) {
  return std::find(stack.rbegin(), stack.rend(), tag) != stack.rend();
 }

-template <typename Fun>
-AnnotatedText apply(AnnotatedText const &in, Fun fun) {
-  AnnotatedText out;
+/// Is tag stack B an extended version of A? I.e. same tags, but maybe a few
+/// more nested deeper.
+bool extends(HTML::TagStack const &b, HTML::TagStack const &a) {
+  if (a.size() > b.size()) return false;

-  for (size_t sentenceIdx = 0; sentenceIdx < in.numSentences(); ++sentenceIdx) {
-    std::string sentence;
-    std::vector<ByteRange> tokens;
+  for (auto i = a.begin(), j = b.begin(); i != a.end(); ++i, ++j)
+    if (*i != *j) return false;

-    std::string prefix = fun(in.annotation.gap(sentenceIdx), in.gap(sentenceIdx), false);
-
-    for (size_t wordIdx = 0; wordIdx < in.numWords(sentenceIdx); ++wordIdx) {
-      std::string token = fun(in.wordAsByteRange(sentenceIdx, wordIdx), in.word(sentenceIdx, wordIdx), false);
-      tokens.push_back(ByteRange{sentence.size(), sentence.size() + token.size()});
-      sentence += token;
-    }
-
-    // Convert our ByteRanges to string_views since that's what appendSentence
-    // expects
-    // TODO: extend AnnotatedText::appendSentence to accept str + ByteRanges
-    // directly
-    std::vector<string_view> views(tokens.size());
-    std::transform(tokens.begin(), tokens.end(), views.begin(),
-                   [&](ByteRange const &range) { return string_view(sentence.data() + range.begin, range.size()); });
-
-    out.appendSentence(prefix, views.begin(), views.end());
-  }
-
-  out.appendEndingWhitespace(fun(in.annotation.gap(in.numSentences()), in.gap(in.numSentences()), true));
-
-  return out;
+  return true;
 }

+/// Tests whether `response` has alignment info associated with it or not.
 bool hasAlignments(Response const &response) {
  // Test for each sentence individually as a sentence may be empty (or there)
  // might be no sentences, so just testing for alignments.empty() would not be
@ -162,11 +158,12 @@ bool hasAlignments(Response const &response) {
  return true;
 }

-// Little helper class to append HTML to a token
+/// Helper class to append HTML tags to a token. Also makes sure the token is
+/// encoded as valid HTML.
 class TokenFormatter {
 public:
-  explicit TokenFormatter(string_view token)
-      : html_(), offset_(0), whitespaceOffset_(0), whitespaceSize_(countPrefixWhitespaces(token)), closeLeft_(true) {
+  explicit TokenFormatter(marian::string_view token)
+      : offset_(0), whitespaceOffset_(0), whitespaceSize_(countPrefixWhitespaces(token)), closeLeft_(true) {
    // Do encoding of any entities that popped up in the translation
    encodeEntities(token, html_);
  }
@ -174,12 +171,12 @@ class TokenFormatter {
  std::string &&html() { return std::move(html_); }

  // Append the markup necessary for moving from `prev` set of tags to `curr`.
-  void append(HTML::Taint const &prev, HTML::Taint const &curr) {
-    HTML::Taint opening, closing;
+  void append(HTML::TagStack const &prev, HTML::TagStack const &curr) {
+    HTML::TagStack opening, closing;

    diffTags(prev, curr, opening, closing);

-    for (HTML::Tag const *tag : reversed(closing)) {
+    for (HTML::Tag const *tag : Reversed(closing)) {
      assert(tag->type == HTML::Tag::ELEMENT);
      std::string closeTag = format("</{}>", tag->name);
      html_.insert(offset_ + (closeLeft_ ? 0 : whitespaceSize_), closeTag);
@ -232,6 +229,8 @@ class TokenFormatter {
  bool closeLeft_;
 };

+/// Count the number of tokens in an AnnotatedText. Used to assert we're not
+/// running out of sync when creating vectors that describe each token.
 size_t debugCountTokens(AnnotatedText const &text) {
  size_t tokens = 1;  // for the ending gap
  for (size_t sentenceIdx = 0; sentenceIdx < text.numSentences(); ++sentenceIdx) {
@ -240,11 +239,87 @@ size_t debugCountTokens(AnnotatedText const &text) {
  return tokens;
 }

+/// Consumes an ignored element like a special tag, but nesting-aware: for
+/// `<a><a></a></a>` it scans up to the last `</a>`. Expects TT_TAG_START to be
+/// consumed already (needed to decide the element is ignored) and `name` to be
+/// lower-case. Attributes go into `tag.attributes`, the raw body into `tag.data`.
+void consumeIgnoredTag(markup::Scanner &scanner, HTML::Tag &tag, std::string const &name) {
+  // Only full elements can be consumed this way. With void tags we don't know
+  // where to stop scanning. All other types cannot be nested anyway.
+  assert(tag.type == HTML::Tag::ELEMENT);
+
+  // TT_TAG_START is already consumed.
+  markup::Scanner::TokenType token;
+  size_t inside = 0;
+
+  // Consume the full open tag, i.e. all its attributes
+  while (!inside) {
+    token = scanner.next();
+    switch (token) {
+      case markup::Scanner::TT_ERROR:
+        ABORT("HTML parse error");
+      case markup::Scanner::TT_EOF:
+        ABORT("Did not find closing tag </{}>", name);
+      case markup::Scanner::TT_ATTRIBUTE:
+        tag.attributes += format(" {}=\"{}\"", scanner.attribute(), scanner.value());
+        break;
+      default:
+        // Not an attribute! Must be something inside the body or the closing
+        // tag already. Time to jump to the next loop.
+        ++inside;
+        break;
+    }
+  }
+
+  // Last token was something that would have triggered Scanner::scanBody(),
+  // which sets value() to start pointing at the body.
+  const char *start = scanner.start();
+
+  // Consume the rest of the HTML until (including) the final closing tag. We
+  // start with the token that caused the previous loop to fall into the default
+  // case.
+  while (inside) {
+    switch (token) {
+      case markup::Scanner::TT_ERROR:
+        ABORT("HTML parse error");
+      case markup::Scanner::TT_EOF:
+        ABORT("Did not find closing tag </{}>", name);
+      case markup::Scanner::TT_TAG_START:
+        // Note: Looking specifically for only our own type of tag so we don't
+        // have to care about whether other tags we encounter are void tags or
+        // not. Does assume the HTML is valid, as no stack is kept.
+        if (toLowerCase(scanner.tag()) == name) ++inside;
+        break;
+      case markup::Scanner::TT_TAG_END:
+        if (toLowerCase(scanner.tag()) == name) --inside;
+        break;
+      default:
+        break;
+    }
+
+    // Only continue scanning if we're still inside. We could have just read the
+    // TT_TAG_END token that ended this element, and we don't want to continue
+    // consuming tokens at that point.
+    if (inside) token = scanner.next();
+  }
+
+  // Only a TAG_END could have stopped the previous loop. We take the start
+  // of the final closing tag as the end of our data.
+  assert(token == markup::Scanner::TT_TAG_END);
+  const char *end = scanner.start();
+
+  // All data between the end of the first open element, and the start of the
+  // last close element, we just treat as raw data that will be printed when
+  // this tag is eventually printed.
+  assert(end >= start);
+  tag.data = std::string_view(start, end - start);
+}
+
 }  // namespace

 namespace marian::bergamot {

-// Formatters used for exception messages combined with format()
+/// Formatters used for formatting error messages in ABORT() calls.
 std::ostream &operator<<(std::ostream &out, HTML::Tag const *tag) {
  if (tag == nullptr) return out << "[nullptr]";
  switch (tag->type) {
@ -262,7 +337,7 @@ std::ostream &operator<<(std::ostream &out, HTML::Tag const *tag) {
  return out << "[Unknown tag type]";
 }

-std::ostream &operator<<(std::ostream &out, HTML::Taint const &tags) {
+std::ostream &operator<<(std::ostream &out, HTML::TagStack const &tags) {
  for (auto it = tags.begin(); it != tags.end(); ++it) {
    if (it != tags.begin()) out << ' ';
    out << *it;
@ -270,18 +345,20 @@ std::ostream &operator<<(std::ostream &out, HTML::Taint const &tags) {
  return out;
 }

-HTML::HTML(std::string &&source, bool process_markup, Options &&options) : options_(std::move(options)) {
-  if (!process_markup) return;
+HTML::HTML(std::string &&source, bool processMarkup, Options &&options) : options_(std::move(options)) {
+  if (!processMarkup) return;

  std::string original = std::move(source);
  markup::instream in(original.data(), original.data() + original.size());
  markup::Scanner scanner(in);
  source.clear();  // source is moved out of, so should be clear anyway

-  Tag *tag;
-  Taint stack;
-  bool addSentenceBreak = false;
-  bool addSpace = false;
+  Tag *tag = nullptr;             // current tag (after opening at least)
+  TagStack stack;                 // stack of currently open tags
+  bool addSentenceBreak = false;  // whether to add a sentence break next text segment
+  bool addWordBreak = false;      // whether to add a word break next text segment
+
+  // Starting point: an empty span with no open tags.
  spans_.push_back(Span{0, 0, {}});

  bool stop = false;
@ -298,13 +375,14 @@ HTML::HTML(std::string &&source, bool process_markup, Options &&options) : optio
        // If the previous segment was the open or close tag of a block element
        // we treat the text after it as a new sentence.
        if (addSentenceBreak) {
-          if (!(source.empty() || (source.size() > 2 && source.substr(source.size() - 2) == ""))) {
+          // If there isn't already a \n\n at the end of source...
+          if (source.size() >= 2 && source.substr(source.size() - 2) != "\n\n") {
            stack.push_back(makeTag({Tag::WHITESPACE}));
            // Important: span->size() == 0 to make it behave as a void element.
            // Also important: position before the \n\n tokens, not after, to
            // make it easier to remove them later through apply().
            spans_.push_back(Span{source.size(), source.size(), stack});
-            source.append("\n\n");  // TODO assumes ssplit-mode = wrapped_text
+            source.append("\n\n");  // Should work with ssplit-mode = wrapped_text
            stack.pop_back();
          }
          addSentenceBreak = false;
@ -312,24 +390,27 @@ HTML::HTML(std::string &&source, bool process_markup, Options &&options) : optio

        // If the previous segment was an open or close tag, it might be best
        // to add a space to make sure we don't append to the previous word.
-        if (addSpace) {
-          if (options_.substituteInlineTagsWithSpaces && !source.empty() && !std::isspace(source.back()) &&
-              !std::isspace(scanner.value()[0])) {
+        if (addWordBreak) {
+          // Only add the space when it would be inside a word. Do not add it if
+          // it would be between a word and punctuation.
+          if (options_.substituteInlineTagsWithSpaces && isContinuation(source, scanner.value())) {
            source.push_back(' ');
          }
-          addSpace = false;
+          addWordBreak = false;
        }

+        // Store which tags were open when this span of text was encountered.
        auto begin = source.size();
        source.append(scanner.value());
        spans_.push_back(Span{begin, source.size(), stack});
      } break;

      case markup::Scanner::TT_TAG_START: {
-        std::string name(scanner.tag());
+        std::string name = toLowerCase(scanner.tag());

        // Tag *tag is used by attribute parsing
-        tag = makeTag({contains(options_.voidTags, name) ? Tag::VOID_ELEMENT : Tag::ELEMENT, std::move(name)});
+        auto type = contains(options_.voidTags, name) ? Tag::VOID_ELEMENT : Tag::ELEMENT;
+        tag = makeTag({type, std::string(scanner.tag())});

        stack.push_back(tag);

@ -341,39 +422,48 @@ HTML::HTML(std::string &&source, bool process_markup, Options &&options) : optio
          stack.pop_back();
        }

+        // Ignored tags have same semantics as void tags with regards to moving
+        // them around with the rest of the content.
+        if (contains(options_.ignoredTags, name)) {
+          consumeIgnoredTag(scanner, *tag, name);
+          spans_.push_back(Span{source.size(), source.size(), stack});
+          stack.pop_back();
+        }
+
        // Treat non-inline HTML tags as spaces that break up words.
-        if (!contains(options_.inlineTags, tag->name)) {
+        if (!contains(options_.inlineTags, name)) {
          addSentenceBreak = true;
-        } else {
-          addSpace = true;
+        } else if (!contains(options_.inWordTags, name)) {
+          addWordBreak = true;
        }
      } break;

-      case markup::Scanner::TT_TAG_END:
+      case markup::Scanner::TT_TAG_END: {
+        std::string tagName = toLowerCase(scanner.tag());
        // If this is the closing bit of a void tag, i.e. triggered by the "/>"
        // bit of "<img/>", then completely ignore it.
-        if (contains(options_.voidTags, std::string(scanner.tag()))) break;
+        if (contains(options_.voidTags, tagName)) break;

        ABORT_IF(stack.empty(), "Encountered more closing tags ({}) than opening tags", scanner.tag());

-        ABORT_IF(stack.back()->name != scanner.tag(), "Encountered unexpected closing tag </{}>, stack is {}",
-                 scanner.tag(), stack);
+        ABORT_IF(toLowerCase(stack.back()->name) != toLowerCase(scanner.tag()),
+                 "Encountered unexpected closing tag </{}>, stack is {}", scanner.tag(), stack);

        // What to do with "<u></u>" case, where tag is immediately closed
        // so it never makes it into the taint of any of the spans? This adds
        // an empty span so it still gets recorded in spans_.
-        if (spans_.empty() || !containsTag(spans_.back().tags, stack.back()))
+        if (spans_.empty() || !contains(spans_.back().tags, stack.back()))
          spans_.push_back(Span{source.size(), source.size(), stack});

        stack.pop_back();

        // Add space if necessary
-        if (!contains(options_.inlineTags, std::string(scanner.tag()))) {
+        if (!contains(options_.inlineTags, tagName)) {
          addSentenceBreak = true;
-        } else {
-          addSpace = true;
+        } else if (!contains(options_.inWordTags, tagName)) {
+          addWordBreak = true;
        }
-        break;
+      } break;

      case markup::Scanner::TT_ATTRIBUTE:
        assert(tag != nullptr);
@ -448,10 +538,10 @@ void HTML::restore(Response &response) {

  // Find for every token in target the token in source that best matches.
  std::vector<std::vector<size_t>> alignments;
-  hardAlignments(response, alignments);
+  hardAlignments(response, alignments, sourceTokenSpans);

  std::vector<SpanIterator> targetTokenSpans;
-  copyTaint(response, alignments, sourceTokenSpans, targetTokenSpans);
+  copyTagStack(response, alignments, sourceTokenSpans, targetTokenSpans);
  assert(targetTokenSpans.size() == debugCountTokens(response.target));

  AnnotatedText target = restoreTarget(response.target, targetTokenSpans);
@ -466,7 +556,7 @@ AnnotatedText HTML::restoreSource(AnnotatedText const &in, std::vector<SpanItera
                                 // and the while-loop below will do the rest
  assert(prevIt == spans_.end() || prevIt->tags.empty());

-  return apply(in, [&](ByteRange range, string_view token, bool last) {
+  return in.apply([&](ByteRange range, string_view token, bool last) {
    TokenFormatter formatter(token);

    // Potential issue: spans and tokens can intersect, e.g.
@ -475,9 +565,11 @@ AnnotatedText HTML::restoreSource(AnnotatedText const &in, std::vector<SpanItera
    //   spans     |1|   |2|    |3333| (so only 2 is tainted with <p><u>, others only <p>)
    //  tokens     |111111111111111|2|
    //
-    // Now 1 covers span 1 to 3, so what taint should it get? Just <p>, or <p><u>?
-    // Note: only relevant if isBlockElement is used. If we just insert spaces
-    // around all elements, every segment of `hello` will be a token.
+    // Now 1 covers span 1 to 3, so what taint should it get? Just `<p>`, or
+    // `<p><u>`?
+    // Note: only relevant if `substituteInlineTagsWithSpaces` is true. If we
+    // just insert spaces around all elements, every segment of `hello` will be
+    // a token.

    // Seek to the last span that overlaps with this token
    while (true) {
@ -494,7 +586,7 @@ AnnotatedText HTML::restoreSource(AnnotatedText const &in, std::vector<SpanItera

    // TODO: This is just the taint of the last span, not the ones in between.
    // This makes us lose some markup of parts of tokens as described above.
-    sourceTokenSpans.push_back(prevIt);
+    sourceTokenSpans.emplace_back(prevIt);

    return std::move(formatter.html());
  });
@ -503,27 +595,28 @@ AnnotatedText HTML::restoreSource(AnnotatedText const &in, std::vector<SpanItera
 AnnotatedText HTML::restoreTarget(AnnotatedText const &in, std::vector<SpanIterator> const &targetTokenSpans) {
  auto prevSpan = spans_.cbegin();
  auto targetSpanIt = targetTokenSpans.begin();
+  auto straggerSpanIt = spans_.cbegin();

-  AnnotatedText out = apply(in, [&](ByteRange range, string_view token, bool last) {
+  AnnotatedText out = in.apply([&]([[maybe_unused]] ByteRange range, string_view token, bool last) {
    TokenFormatter formatter(token);

    // First we scan through spans_ to catch up to the span assigned to this
    // token. We're only interested in empty spans (empty and void elements)
-    for (auto span_it = prevSpan; span_it < *targetSpanIt; span_it++) {
+    for (; straggerSpanIt < *targetSpanIt; ++straggerSpanIt) {
      // We're only interested in empty spans or spans that would otherwise get
      // lost because they didn't align with anything between the spans in
      // targetSpanIt
      // TODO That std::find makes this O(N*N) NOT GOOD NOT GOOD
-      if (span_it->size() != 0 &&
-          std::find(targetTokenSpans.begin(), targetTokenSpans.end(), span_it) != targetTokenSpans.end())
+      if (straggerSpanIt->size() != 0 &&
+          std::find(targetTokenSpans.begin(), targetTokenSpans.end(), straggerSpanIt) != targetTokenSpans.end())
        continue;

-      formatter.append(prevSpan->tags, span_it->tags);
+      formatter.append(prevSpan->tags, straggerSpanIt->tags);

      // Note: here, not in 3rd part of for-statement because we don't want to
      // set prevSpan if the continue clause at the beginning of this for-loop
      // was hit.
-      prevSpan = span_it;
+      prevSpan = straggerSpanIt;
    }

    // Now do the same thing but for our target set of tags. Note that we cannot
@ -539,7 +632,7 @@ AnnotatedText HTML::restoreTarget(AnnotatedText const &in, std::vector<SpanItera
      // the last token of the output. But lets assume someone someday changes
      // HardAlignments(), and then this for-loop will be necessary.
      // assert((*targetSpanIt)->tags.empty());
-      formatter.append((*targetSpanIt)->tags, HTML::Taint());
+      formatter.append((*targetSpanIt)->tags, HTML::TagStack());
    }

    prevSpan = *targetSpanIt;
@ -559,8 +652,9 @@ HTML::Tag *HTML::makeTag(Tag &&tag) {
  return &pool_.front();
 }

-void HTML::copyTaint(Response const &response, std::vector<std::vector<size_t>> const &alignments,
-                     std::vector<SpanIterator> const &sourceTokenSpans, std::vector<SpanIterator> &targetTokenSpans) {
+void HTML::copyTagStack(Response const &response, std::vector<std::vector<size_t>> const &alignments,
+                        std::vector<SpanIterator> const &sourceTokenSpans,
+                        std::vector<SpanIterator> &targetTokenSpans) {
  size_t offset = 0;  // Sentence offset in sourceTokenSpans

  // Fill targetTokenSpans based on the alignments we just made up.
@ -584,14 +678,25 @@ void HTML::copyTaint(Response const &response, std::vector<std::vector<size_t>>
 // to determine whether we should share the markup, or whether we should see
 // this token as a fresh start. This implementation will treat "hello[world]"
 // as 4 words, assuming its tokenised as something like `h ell o [ wor ld ]`.
-bool HTML::isContinuation(string_view prev, string_view str) {
+bool HTML::isContinuation(std::string_view prev, std::string_view str) const {
  if (options_.continuationDelimiters.empty()) return false;
  if (prev.empty() || str.empty()) return false;
  return options_.continuationDelimiters.find(str[0]) == std::string::npos &&
         options_.continuationDelimiters.find(prev.back()) == std::string::npos;
 }

-void HTML::hardAlignments(Response const &response, std::vector<std::vector<size_t>> &alignments) {
+bool HTML::isContinuation(marian::string_view prev, marian::string_view str) const {
+  return isContinuation(std::string_view(prev.data(), prev.size()), std::string_view(str.data(), str.size()));
+}
+
+/// Selects for each token in `response.target` a best source token from
+/// `response.source` and writes this selection to `alignments`. The source
+/// token spans are used to also look at the markup applied to each token to
+/// figure out which source token best represents each target token.
+void HTML::hardAlignments(Response const &response, std::vector<std::vector<size_t>> &alignments,
+                          std::vector<SpanIterator> const &sourceTokenSpans) {
+  size_t offset = 0;  // sentence offset in sourceTokenSpans
+
  // For each sentence...
  for (size_t sentenceIdx = 0; sentenceIdx < response.target.numSentences(); ++sentenceIdx) {
    alignments.emplace_back();
@ -600,14 +705,9 @@ void HTML::hardAlignments(Response const &response, std::vector<std::vector<size
    // Note: only search from 0 to N-1 because token N is end-of-sentence token
    // that can only align with the end-of-sentence token of the target
    for (size_t t = 0; t + 1 < response.target.numWords(sentenceIdx); ++t) {
-      size_t maxS = 0;
-      for (size_t s = 1; s + 1 < response.source.numWords(sentenceIdx); ++s) {
-        if (response.alignments[sentenceIdx][t][s] > response.alignments[sentenceIdx][t][maxS]) {
-          maxS = s;
-        }
-      }
-
-      alignments.back().push_back(maxS);
+      alignments.back().push_back(
+          std::max_element(response.alignments[sentenceIdx][t].begin(), response.alignments[sentenceIdx][t].end()) -
+          response.alignments[sentenceIdx][t].begin());
    }

    // Next, we try to smooth out these selected alignments with a few heuristics
@ -622,7 +722,14 @@ void HTML::hardAlignments(Response const &response, std::vector<std::vector<size
        float currScore = response.alignments[sentenceIdx][t][currSentenceIdx];
        float prevScore = response.alignments[sentenceIdx][t - 1][prevSentenceIdx];

-        if (currScore >= prevScore) {
+        TagStack const &currTagStack = sourceTokenSpans[offset + 1 + currSentenceIdx]->tags;
+        TagStack const &prevTagStack = sourceTokenSpans[offset + 1 + prevSentenceIdx]->tags;
+
+        // If this token has more markup, or a better score than the previous
+        // token (and they together are part of a word-ish thing) then mark
+        // this word as aligning. Otherwise just copy the alignment source of
+        // the previous token.
+        if (extends(currTagStack, prevTagStack) || currScore >= prevScore) {
          // Apply this to all previous tokens in the word
          for (size_t i = t;; --i) {
            alignments.back()[i] = currSentenceIdx;
@ -640,6 +747,8 @@ void HTML::hardAlignments(Response const &response, std::vector<std::vector<size

    // Always align target end with source end
    alignments.back().push_back(response.source.numWords(sentenceIdx) - 1);
+
+    offset += response.source.numWords(sentenceIdx) + 1;  // +1 for prefix gap
  }
 }

--- a/src/translator/html.h
+++ b/src/translator/html.h
@ -2,51 +2,123 @@
 #define SRC_BERGAMOT_HTML_H_

 #include <forward_list>
+#include <set>
 #include <stdexcept>
 #include <string>
-#include <unordered_set>
+#include <string_view>

 #include "annotation.h"
+#include "data/types.h"
 #include "definitions.h"

-namespace marian {
-namespace bergamot {
+namespace marian::bergamot {

 struct Response;

+/// HTML class parses and removes HTML from input text, and places it back into
+/// the translated output text.
+///
+/// When parsing the HTML, it treats tags as markup, where a list of nested tags
+/// can be seen as a list of markups that are applicable to all the text that
+/// follows. This list is stored as a `TagStack`. Whenever an HTML tag opens or
+/// closes, a new TagStack is created to reflect that. TagStack used to be
+/// called `Taint` because it *tainted* the text it was associated with, using
+/// those tags as markup. The text between the tags is stored in the
+/// input variable. In `spans_`, the TagStack that is associated with a
+/// substring of that text is stored.
+/// When transferring the HTML from the source text to the translated target
+/// text, the TagStacks are first associated with each of the subwords from the
+/// source text. Using hard alignment, each subword in the source text is linked
+/// to a subword in the target text. The TagStacks are then copied over these
+/// links. Finally, the HTML is inserted back into the target text by, for
+/// each subword, comparing its TagStack to that of the previous subword, and
+/// opening and closing elements to make up for the difference.
+///
+/// There are a couple of complexities though:
+/// 1. Not all tags can be treated as markup applied to text. For example, an
+///    `<img>` does not contain text itself. Or `<i></i>` does not. We do want
+///    those tags to remain in the output though. We do this by associating
+///    them to an empty `Span`. When inserting HTML back into the translation
+///    input or output, we keep track of where in the `spans_` vector we are,
+///    and insert any elements from empty spans that we might have skipped over
+///    because empty spans are never linked to tokens/subwords. These are
+///    *stragglers* in some parts of the code, or *void* or *empty* elements in
+///    other parts.
+/// 2. Some tags should be treated as paragraph indicators, and break up
+///    sentences. These are the usual suspects like `<p>`, but also `<li>` and
+///    `<td>`, to make sure we don't translate two table cells into a single
+///    word. This is the `addSentenceBreak` flag in the HTML parsing bit.
+///    We mark these breaks with `\n\n` in the input text and with a special
+///    WHITESPACE tag that we treat as any other void tag. Hopefully this tag
+///    moves with the added `\n\n` and it is easy for us to remove it again.
+///    (In practice it is, since these only occur at the end of sentences and
+///    the ends of sentences are always aligned between source and target.)
+/// 3. We treat most tags as word-breaking. We do this by adding spaces just
+///    after where we saw the open or close tag occur. If there is already
+///    some whitespace in that place, we do not add extra spaces.
+/// 4. TODO
 class HTML {
 public:
+  using TagNameSet = std::set<std::string, std::less<>>;
+
+  /// Options struct that controls how HTML is interpreted.
  struct Options {
-    // List of elements for which we do not expect a closing tag, or self-closing
-    // elements in XHTML. See also https://developer.mozilla.org/en-US/docs/Glossary/Empty_element
-    // More relevant source of this list:
-    // https://searchfox.org/mozilla-central/rev/7d17fd1fe9f0005a2fb19e5d53da4741b06a98ba/dom/base/FragmentOrElement.cpp#1791
-    std::unordered_set<std::string> voidTags{"area",  "base",  "basefont", "bgsound", "br",    "col",
-                                             "embed", "frame", "hr",       "img",     "input", "keygen",
-                                             "link",  "meta",  "param",    "source",  "track", "wbr"};
+    /// List of elements for which we do not expect a closing tag, or
+    /// self-closing elements in XHTML. We do not need to see a closing tag
+    /// for these elements, and they cannot contain text or tags themselves.
+    /// See also:
+    /// https://developer.mozilla.org/en-US/docs/Glossary/Empty_element.
+    /// More relevant source of this list:
+    /// https://searchfox.org/mozilla-central/rev/7d17fd1fe9f0005a2fb19e5d53da4741b06a98ba/dom/base/FragmentOrElement.cpp#1791
+    TagNameSet voidTags{"area", "base",  "basefont", "bgsound", "br",   "col",   "embed",  "frame", "hr",
+                        "img",  "input", "keygen",   "link",    "meta", "param", "source", "track", "wbr"};

-    std::unordered_set<std::string> inlineTags{"abbr",   "a", "b",    "em",    "i",    "kbd",    "mark", "math",
-                                               "output", "q", "ruby", "small", "span", "strong", "sub",  "sup",
-                                               "time",   "u", "var",  "wbr",   "ins",  "del",    "img"};
+    /// List of elements that are treated as inline, meaning they do not break
+    /// up sentences. Any element *not* in this list will cause the text that
+    /// follows its open or close tag to be treated as a separate sentence.
+    TagNameSet inlineTags{"abbr",   "a", "b",    "em",    "i",    "kbd",    "mark", "math",
+                          "output", "q", "ruby", "small", "span", "strong", "sub",  "sup",
+                          "time",   "u", "var",  "wbr",   "ins",  "del",    "img"};

-    // List of characters that occur at the start of a token that indicate that
-    // the this token is probably *not* a continuation of a word. Set to empty
-    // to never mark a token as a continuation of the word.
-    // std::string continuationDelimiters = "\n ,.(){}[]";
-    std::string continuationDelimiters;
+    /// List of elements that are, regardless of `substituteInlineTagsWithSpaces`,
+    /// not substituted with spaces. Technically almost all inline elements
+    /// should be treated like this, except `<br>` maybe, but in practice it
+    /// seems to be more effective to limit this set to just the one tag that
+    /// can only really be used *inside* words: `<wbr>`.
+    /// See also: https://developer.mozilla.org/en-US/docs/Web/HTML/Element/wbr
+    TagNameSet inWordTags{"wbr"};

-    // Should we always add spaces to the places where tags used to be? I.e.
-    // `un<u>der</u>line` should become `un der line`?
+    /// List of elements we copy as is, but do parse as if they're HTML because
+    /// they could be nested. For <script> we just scan for </script> because
+    /// the script tag may not be nested, but that is not the case for these
+    /// elements per se. Some tags, like <script>, are ignored at the `Scanner`
+    /// level. See `xh_scanner.cpp/Scanner::scanAttribute()`.
+    TagNameSet ignoredTags{"code", "kbd", "samp", "var", "dir", "acronym", "math"};
+
+    /// List of characters that occur at the start of a token that indicate that
+    /// this token is probably *not* a continuation of a word. This is also
+    /// used to determine whether there should be a space after a closing tag
+    /// or not. I.e. a `.` after a `</strong>` does not need to be separated by
+    /// an extra space.
+    std::string continuationDelimiters = "\n ,.(){}[]";
+
+    /// Should we always add spaces to the places where tags used to be? I.e.
+    /// `un<u>der</u>line` should become `un der line`? This does help with
+    /// retaining tags inside words, or with odd pages that use CSS to add
+    /// spacing between a lot of tags. Cases like `<td>` and `<li>` are already
+    /// covered by treating them as sentence splitting.
    bool substituteInlineTagsWithSpaces = true;
  };

+  /// Represents a tag, or markup that is being applied to a string of text.
+  /// We treat all elements except `ELEMENT` as void elements or empty elements.
  struct Tag {
    enum NodeType {
-      ELEMENT,
-      VOID_ELEMENT,
-      COMMENT,
-      PROCESSING_INSTRUCTION,
-      WHITESPACE,  // negative space
+      ELEMENT,                 // <b>...</b>
+      VOID_ELEMENT,            // <img>
+      COMMENT,                 // <!-- ... -->
+      PROCESSING_INSTRUCTION,  // <?...?>
+      WHITESPACE,              // A \n\n we inserted to break a sentence.
    };

    NodeType type;           // Type of the node
@ -55,48 +127,94 @@ class HTML {
                             // entities and prefix whitespace)
    std::string data;        // Raw data of an element that just needs to be
                             // copied as is, e.g. <script> or <style>
-    // @TODO: if the original HTML stays in memory, we could replace
-    // `attributes` and `data` with string_views pointing to it.
  };

-  using Taint = std::vector<Tag *>;
+  /// Representation of markup that is being applied to a string of text. Order
+  /// matters as this represents how the tags are nested. The `Tag` objects
+  /// themselves are owned by `pool_`.
+  using TagStack = std::vector<Tag *>;

+  /// Span of text, with which a `TagStack` is associated. A span may be empty,
+  /// for example to represent the presence of an empty or VOID element.
  struct Span {
-    size_t begin;
-    size_t end;
-    Taint tags;  // Note: free pointers! Lifetime of tags is managed by pool_
+    size_t begin;   // Start offset in (plain text) source
+    size_t end;     // End offset in source (exclusive; size() == end - begin)
+    TagStack tags;  // Note: free pointers to memory owned by `pool_`.
    inline size_t size() const { return end - begin; }
  };

-  explicit HTML(std::string &&source, bool process_markup) : HTML(std::move(source), process_markup, HTML::Options{}){};
-  explicit HTML(std::string &&source, bool process_markup, Options &&options);
+  /// Parses HTML in `source` (if `processMarkup` is true). `source` is updated
+  /// to only contain the plain text extracted from the HTML. `HTML` instance
+  /// retains information about what tags are extracted from where to later
+  /// reconstruct the HTML in a `Response` object (both `source` and `target`).
+  explicit HTML(std::string &&source, bool processMarkup) : HTML(std::move(source), processMarkup, HTML::Options{}){};
+  explicit HTML(std::string &&source, bool processMarkup, Options &&options);
+
+  /// It is not safe to copy an HTML instance (spans hold pointers into `pool_`).
+  HTML(const HTML &) = delete;
+
+  /// Moving is fine
+  HTML(HTML &&) = default;
+
+  /// Reconstructs (not perfectly) the HTML as it was parsed from `source`,
+  /// and uses alignment information to also reconstruct the same markup in
+  /// `response.target`.
  void restore(Response &response);

 private:
  using SpanIterator = std::vector<HTML::Span>::const_iterator;
  using AnnotatedText = marian::bergamot::AnnotatedText;

+  /// Reconstructs HTML in `response.source` (passed as `in`) and makes a list
+  /// `sourceTokenSpans` that associates a `Span` with each subword in `in`.
+  /// We later use these span pointers to copy tags. They're iterators (or
+  /// pointers into a list) to be able to compare whether one span came before
+  /// or after another span.
  AnnotatedText restoreSource(AnnotatedText const &in, std::vector<SpanIterator> &sourceTokenSpans);
+
+  /// Inserts the HTML into `response.target` (passed as `in`) based on
+  /// `targetTokenSpans`, which points to a `Span` for each token (subword) in
+  /// `response.target`.
  AnnotatedText restoreTarget(AnnotatedText const &in, std::vector<SpanIterator> const &targetTokenSpans);
-  void copyTaint(Response const &response, std::vector<std::vector<size_t>> const &alignments,
-                 std::vector<HTML::SpanIterator> const &sourceTokenSpans,
-                 std::vector<HTML::SpanIterator> &targetTokenSpans);
-  void hardAlignments(Response const &response, std::vector<std::vector<size_t>> &alignments);
-  bool isContinuation(string_view prev, string_view str);
-  // Allocates tag in pool_ (which then owns it) and gives a pointer to be used
-  // in Taints. Pointer is valid as long as this HTML instance lives on.
+
+  /// Utilities to test whether subword `str` is part of a word together with
+  /// the subword `prev`, or a separate word. Basically *does `str` start with
+  /// a space*, but a bit more complex to deal with punctuation.
+  bool isContinuation(marian::string_view prev, marian::string_view str) const;
+  bool isContinuation(std::string_view prev, std::string_view str) const;
+
+  /// Copies span pointers from the subwords/tokens from the source text to the
+  /// subwords of the target text in `targetTokenSpans` using alignment
+  /// information in `response`.
+  void copyTagStack(Response const &response, std::vector<std::vector<size_t>> const &alignments,
+                    std::vector<HTML::SpanIterator> const &sourceTokenSpans,
+                    std::vector<HTML::SpanIterator> &targetTokenSpans);
+
+  /// Turns the alignment scores in `response.alignments` into one source token
+  /// per target token. Has some heuristics to keep all target tokens of a
+  /// single word pointing to the same span, and prefers spans with more markup
+  /// over spans with less to try to retain as much of the input markup as
+  /// possible.
+  void hardAlignments(Response const &response, std::vector<std::vector<size_t>> &alignments,
+                      std::vector<HTML::SpanIterator> const &sourceTokenSpans);
+
+  /// Allocates a tag in `pool_` (which then owns it) and gives a pointer to be
+  /// used in TagStacks. Pointer is valid as long as this HTML instance lives on.
  Tag *makeTag(Tag &&tag);

+  /// HTML options associated with this parse.
  Options options_;

-  // List of text spans, and which tags are applied to them
+  /// List of spans of text in plain text `source`, and which tags are applied
+  /// to them.
  std::vector<Span> spans_;

-  // a pool of tags that we free when HTML goes out of scope
+  /// A pool of tags. `std::forward_list` because we do not want pointers to it
+  /// to be invalidated when new tags are allocated. This way it is easy to
+  /// deallocate them all when `HTML` goes out of scope.
  std::forward_list<Tag> pool_;
 };

-}  // namespace bergamot
-}  // namespace marian
+}  // namespace marian::bergamot

 #endif  // SRC_BERGAMOT_HTML_H_
--- a/src/translator/response_options.h
+++ b/src/translator/response_options.h
@ -19,7 +19,7 @@ struct ResponseOptions {
  bool qualityScores{false};  ///< Include quality-scores or not.
  bool alignment{false};      ///< Include alignments or not.

-  bool HTML{false};  /// Remove HTML tags from text and (TODO) insert in output.
+  bool HTML{false};  ///< Remove HTML tags from text and insert in output.

  /// Whether to include sentenceMappings or not. Alignments require
  /// sentenceMappings and are available irrespective of this option if
--- a/src/translator/xh_scanner.cpp
+++ b/src/translator/xh_scanner.cpp
@ -37,6 +37,11 @@ bool operator==(markup::string_ref const &str, const Char_t (&str2)[Len]) {
  return str.size == Len - 1 && std::memcmp(str.data, str2, Len - 1) == 0;
 }

+template <size_t N>
+constexpr size_t length(char const (&/*unused*/)[N]) {
+  return N - 1;
+}
+
 }  // end namespace

 namespace markup {
@ -52,6 +57,8 @@ std::string_view Scanner::tag() const { return std::string_view(tagName_.data, t
 Scanner::TokenType Scanner::scanBody() {
  value_ = string_ref{input_.pos(), 0};

+  start_ = input_.pos();
+
  switch (input_.peek()) {
    case '\0':
      return TT_EOF;
@ -97,15 +104,16 @@ Scanner::TokenType Scanner::scanAttribute() {
  switch (input_.peek()) {
    case '>':
      input_.consume();
-      if (equalsCaseInsensitive(tagName_, "script")) {
+
+      // Treat some elements as opaque, e.g. <script>, <style>
+      if (/*equalsCaseInsensitive(tagName_, "title") ||*/ equalsCaseInsensitive(tagName_, "script") ||
+          equalsCaseInsensitive(tagName_, "style") || equalsCaseInsensitive(tagName_, "textarea") ||
+          equalsCaseInsensitive(tagName_, "iframe") || equalsCaseInsensitive(tagName_, "noembed") ||
+          equalsCaseInsensitive(tagName_, "noscript") || equalsCaseInsensitive(tagName_, "noframes")) {
        // script is special because we want to parse the attributes,
        // but not the content
        scanFun_ = &Scanner::scanSpecial;
        return scanSpecial();
-      } else if (equalsCaseInsensitive(tagName_, "style")) {
-        // same with style
-        scanFun_ = &Scanner::scanSpecial;
-        return scanSpecial();
      } else {
        scanFun_ = &Scanner::scanBody;
        return scanBody();
@ -198,10 +206,11 @@ Scanner::TokenType Scanner::scanAttribute() {
 // - TT_ENTITY_START
 // - TT_ERROR if unexpected character or end
 Scanner::TokenType Scanner::scanTag() {
+  start_ = input_.pos();
  if (input_.consume() != '<') return TT_ERROR;

-  bool is_tail = input_.peek() == '/';
-  if (is_tail) input_.consume();
+  bool isTail = input_.peek() == '/';
+  if (isTail) input_.consume();

  tagName_ = string_ref{input_.pos(), 0};

@ -226,7 +235,7 @@ Scanner::TokenType Scanner::scanTag() {

  if (!input_.peek()) return TT_EOF;

-  if (is_tail) return input_.consume() == '>' ? TT_TAG_END : TT_ERROR;
+  if (isTail) return input_.consume() == '>' ? TT_TAG_END : TT_ERROR;

  scanFun_ = &Scanner::scanAttribute;
  return TT_TAG_START;
@ -234,6 +243,7 @@ Scanner::TokenType Scanner::scanTag() {

 Scanner::TokenType Scanner::scanEntity(TokenType parentTokenType) {
  // `entity` includes starting '&' and ending ';'
+  start_ = input_.pos();
  string_ref entity{input_.pos(), 0};
  bool hasEnd = false;

@ -312,11 +322,13 @@ bool Scanner::isWhitespace(char c) {

 Scanner::TokenType Scanner::scanComment() {
  if (gotTail_) {
+    start_ = input_.pos() - length("-->");  // minus "-->"
    scanFun_ = &Scanner::scanBody;
    gotTail_ = false;
    return TT_COMMENT_END;
  }

+  start_ = input_.pos();
  value_ = string_ref{input_.pos(), 0};

  while (true) {
@ -325,7 +337,7 @@ Scanner::TokenType Scanner::scanComment() {

    if (endsWith(value_, "-->")) {
      gotTail_ = true;
-      value_.size -= 3;
+      value_.size -= length("-->");
      break;
    }
  }
@ -334,11 +346,13 @@ Scanner::TokenType Scanner::scanComment() {

 Scanner::TokenType Scanner::scanProcessingInstruction() {
  if (gotTail_) {
+    start_ = input_.pos() - length("?>");
    scanFun_ = &Scanner::scanBody;
    gotTail_ = false;
    return TT_PROCESSING_INSTRUCTION_END;
  }

+  start_ = input_.pos();
  value_ = string_ref{input_.pos(), 0};

  while (true) {
@ -347,7 +361,7 @@ Scanner::TokenType Scanner::scanProcessingInstruction() {

    if (endsWith(value_, "?>")) {
      gotTail_ = true;
-      value_.size -= 2;
+      value_.size -= length("?>");
      break;
    }
  }
@ -356,11 +370,13 @@ Scanner::TokenType Scanner::scanProcessingInstruction() {

 Scanner::TokenType Scanner::scanSpecial() {
  if (gotTail_) {
+    start_ = input_.pos() - (tagName_.size + length("</>"));
    scanFun_ = &Scanner::scanBody;
    gotTail_ = false;
    return TT_TAG_END;
  }

+  start_ = input_.pos();
  value_ = string_ref{input_.pos(), 0};

  while (true) {
@ -369,17 +385,17 @@ Scanner::TokenType Scanner::scanSpecial() {

    // Test for </tag>
    // TODO: no whitespaces allowed? Is that okay?
-    if (value_.data[value_.size - 1] == '>' && value_.size >= tagName_.size + 3) {
+    if (value_.data[value_.size - 1] == '>' && value_.size >= tagName_.size + length("</>")) {
      // Test for the "</"" bit of "</tag>"
-      size_t pos_tag_start = value_.size - tagName_.size - 3;
-      if (std::memcmp(value_.data + pos_tag_start, "</", 2) != 0) continue;
+      size_t posTagStart = value_.size - tagName_.size - length("</>");
+      if (std::memcmp(value_.data + posTagStart, "</", length("</")) != 0) continue;

      // Test for the "tag" bit of "</tag>". Doing case insensitive compare because <I>...</i> is okay.
-      size_t pos_tag_name = value_.size - tagName_.size - 1;  // end - tag>
-      if (!equalsCaseInsensitive(value_.data + pos_tag_name, tagName_.data, tagName_.size)) continue;
+      size_t posTagName = value_.size - tagName_.size - length(">");  // end - tag>
+      if (!equalsCaseInsensitive(value_.data + posTagName, tagName_.data, tagName_.size)) continue;

      gotTail_ = true;
-      value_.size -= tagName_.size + 3;
+      value_.size -= tagName_.size + length("</>");
      break;
    }
  }
--- a/src/translator/xh_scanner.h
+++ b/src/translator/xh_scanner.h
@ -83,6 +83,7 @@ class Scanner {
        tagName_{nullptr, 0},
        attributeName_{nullptr, 0},
        input_(is),
+        start_(nullptr),
        scanFun_(&Scanner::scanBody),
        gotTail_(false) {}

@ -98,6 +99,8 @@ class Scanner {
  // get tag name
  std::string_view tag() const;

+  inline const char *start() const { return start_; }
+
 private: /* methods */
  typedef TokenType (Scanner::*ScanPtr)();

@ -137,6 +140,9 @@ class Scanner {

  instream &input_;

+  // Start position of a token.
+  const char *start_;
+
  bool gotTail_;  // aux flag used in scanComment, scanSpecial, scanProcessingInstruction
 };
 }  // namespace markup