HTML transfer empty elements (#283)

* Fix test case This should now be implemented * Remove FilterEmpty This path wasn't used anymore anyway, empty tags just got their own spans, and never reached the stack. * Insert skipped empty source spans into target HTML Also refactor variable names to better match their contents and be more consistent with each other. This implementation passes all test cases, finally! * Fix remaining style changes * Move HTML formatting to its own section That code had become exact copies in three different places
2024-09-11 13:46:12 +03:00 · 2021-12-21 14:44:04 +01:00 · 2021-12-21 14:44:04 +01:00 · f55377b687
commit f55377b687
parent bcbbfe1295
2 changed files with 146 additions and 123 deletions
--- a/src/tests/units/html_tests.cpp
+++ b/src/tests/units/html_tests.cpp
@ -162,7 +162,7 @@ TEST_CASE("Do not abort if the input is just empty element") {
  Response response;
  html.restore(response);
  CHECK(response.source.text == "<p></p>");
-  CHECK(response.target.text == "");  // Should be <p></p> but hey not there yet.
+  CHECK(response.target.text == "<p></p>");
 }

 TEST_CASE("Test case html entities") {
@ -388,7 +388,7 @@ TEST_CASE("Test empty self-closing pair at end of input in parent") {
  CHECK(input == "hello ");
 }

-TEST_CASE("Test empty tag", "[!mayfail]") {
+TEST_CASE("Test empty tag") {
  std::string test_str(
      "<p id=\"1\">hello <img id=\"1.1\"><span id=\"1.2\"><u id=\"1.2.1\"></u><b id=\"1.2.2\"></b><img "
      "id=\"1.2.3\">world</span></p>\n");
--- a/src/translator/html.cpp
+++ b/src/translator/html.cpp
@ -12,7 +12,7 @@ using marian::bergamot::Response;

 void encodeEntities(string_view const &input, std::string &output) {
  output.clear();
-  output.reserve(input.size());
+  output.reserve(input.size());  // assumes there are no entities in most cases

  for (auto it = input.begin(); it != input.end(); ++it) {
    switch (*it) {
@ -83,6 +83,21 @@ std::string format(std::string const &formatTemplate, Arg arg, Args... args) {
  return os.str();
 }

+// Syntactic sugar around rbegin() and rend() that allows me to write
+// `for (auto &&item : reversed(container))` instead of the needlessly verbose
+// `for (auto it = container.rbegin(); it != container.rend(); ++it)`
+template <typename T>
+class reversed {
+ public:
+  typedef typename T::const_reverse_iterator iterator;
+  explicit reversed(T const &container) : container_(container){};
+  iterator begin() const { return container_.rbegin(); }
+  iterator end() const { return container_.rend(); }
+
+ private:
+  T const &container_;
+};
+
 bool isBlockElement(std::string_view const &name) {
  // List of elements that we expect might occur inside words, and that should
  // not introduce spacings around them. Not strictly inline elements, nor flow
@ -125,16 +140,6 @@ bool intersects(ByteRange const &range, HTML::Span const &span) {
  return range.begin <= span.end && range.end >= span.begin;
 };

-void filterEmpty(HTML::Taint &stack) {
-  auto src = stack.begin();
-  auto dst = stack.begin();
-
-  for (auto src = stack.begin(); src != stack.end(); ++src)
-    if (!(*src)->empty) *(dst++) = *src;
-
-  stack.resize(dst - stack.begin());
-}
-
 bool containsTag(HTML::Taint const &stack, HTML::Tag const *tag) {
  return std::find(stack.rbegin(), stack.rend(), tag) != stack.rend();
 }
@ -159,11 +164,11 @@ AnnotatedText apply(AnnotatedText const &in, Fun fun) {
    // expects
    // TODO: extend AnnotatedText::appendSentence to accept str + ByteRanges
    // directly
-    std::vector<string_view> token_views(tokens.size());
-    std::transform(tokens.begin(), tokens.end(), token_views.begin(),
+    std::vector<string_view> views(tokens.size());
+    std::transform(tokens.begin(), tokens.end(), views.begin(),
                   [&](ByteRange const &range) { return string_view(sentence.data() + range.begin, range.size()); });

-    out.appendSentence(prefix, token_views.begin(), token_views.end());
+    out.appendSentence(prefix, views.begin(), views.end());
  }

  out.appendEndingWhitespace(fun(in.annotation.gap(in.numSentences()), in.gap(in.numSentences()), true));
@ -200,14 +205,14 @@ void hardAlignments(Response const &response, std::vector<std::vector<size_t>> &
    // Note: only search from 0 to N-1 because token N is end-of-sentence token
    // that can only align with the end-of-sentence token of the target
    for (size_t t = 0; t + 1 < response.target.numWords(sentenceIdx); ++t) {
-      size_t s_max = 0;
+      size_t maxS = 0;
      for (size_t s = 1; s + 1 < response.source.numWords(sentenceIdx); ++s) {
-        if (response.alignments[sentenceIdx][t][s] > response.alignments[sentenceIdx][t][s_max]) {
-          s_max = s;
+        if (response.alignments[sentenceIdx][t][s] > response.alignments[sentenceIdx][t][maxS]) {
+          maxS = s;
        }
      }

-      alignments.back().push_back(s_max);
+      alignments.back().push_back(maxS);
    }

    // Next, we try to smooth out these selected alignments with a few heuristics
@ -241,52 +246,84 @@ void hardAlignments(Response const &response, std::vector<std::vector<size_t>> &
  }
 }

+// Internal type used to point to a position in HTML::spans_.
+typedef std::vector<HTML::Span>::const_iterator SpanIterator;
+
 void copyTaint(Response const &response, std::vector<std::vector<size_t>> const &alignments,
-               std::vector<HTML::Taint> const &sourceTokenTags, std::vector<HTML::Taint> &targetTokenTags) {
+               std::vector<SpanIterator> const &sourceTokenSpans, std::vector<SpanIterator> &targetTokenSpans) {
  size_t offset = 0;

-  // Fill targetTokenTags based on the alignments we just made up.
+  // Fill targetTokenSpans based on the alignments we just made up.
  // NOTE: this should match the exact order of Apply()
  for (size_t sentenceIdx = 0; sentenceIdx < response.target.numSentences(); ++sentenceIdx) {
-    targetTokenTags.push_back(sourceTokenTags[offset]);  // token_tag for sentence ending gap
+    targetTokenSpans.push_back(sourceTokenSpans[offset]);  // token_tag for sentence ending gap
    for (size_t t = 0; t < response.target.numWords(sentenceIdx); ++t) {
      size_t s = alignments[sentenceIdx][t];
      assert(s < response.source.numWords(sentenceIdx));
-      targetTokenTags.push_back(sourceTokenTags[offset + 1 + s]);  // +1 for prefix gap
+      targetTokenSpans.push_back(sourceTokenSpans[offset + 1 + s]);  // +1 for prefix gap
    }

    offset += response.source.numWords(sentenceIdx) + 1;  // +1 for prefix gap
  }

-  assert(offset < sourceTokenTags.size());
-  targetTokenTags.push_back(sourceTokenTags[offset]);  // token_tag for ending whitespace
+  assert(offset < sourceTokenSpans.size());
+  targetTokenSpans.push_back(sourceTokenSpans[offset]);  // token_tag for ending whitespace
 }

-AnnotatedText restoreSource(AnnotatedText const &in, std::vector<HTML::Taint> &token_tags,
-                            std::vector<HTML::Span>::const_iterator span_it,
-                            std::vector<HTML::Span>::const_iterator span_end) {
-  auto prev_it = span_it;  // safe because first span is always empty span, and
-                           // and the while-loop below will do the rest
+// Little helper class to append HTML to a token
+class TokenFormatter {
+ public:
+  TokenFormatter(string_view token)
+      : html_(), offset_(0), whitespaceSize_(countPrefixWhitespaces(token)), closeLeft_(true) {
+    // Do encoding of any entities that popped up in the translation
+    encodeEntities(token, html_);
+  }

-  // workspace variables for lambda
-  std::string html;
-  HTML::Taint opening, closing;
+  std::string &&html() { return std::move(html_); }
+
+  // Append the markup necessary for moving from `prev` set of tags to `curr`.
+  void append(HTML::Taint const &prev, HTML::Taint const &curr) {
+    HTML::Taint opening, closing;
+
+    diffTags(prev, curr, opening, closing);
+
+    for (HTML::Tag const *tag : reversed(closing)) {
+      std::string closeTag = format("</{}>", tag->name);
+      html_.insert(offset_ + (closeLeft_ ? 0 : whitespaceSize_), closeTag);
+      offset_ += closeTag.size();
+    }
+
+    for (HTML::Tag const *tag : opening) {
+      std::string openTag = format("<{}{}>", tag->name, tag->attributes);
+      html_.insert(offset_ + whitespaceSize_, openTag);
+      offset_ += openTag.size();
+      closeLeft_ = false;
+    }
+  }
+
+ private:
+  std::string html_;       // Output html
+  size_t offset_;          // Size added by prepending HTML
+  size_t whitespaceSize_;  // number of prefix whitespace characters
+
+  // Close tags we want to show up left (before) the token, but open tags
+  // ideally come directly after any prefix whitespace. However, some tokens
+  // match multiple spans. If a previous span has added an open tag, after any
+  // whitespace, and the next span closes said tag again, we need to close
+  // it after the whitespace. So after the first open tag, any closing tag
+  // should also align right, after whitespace, not before. Hence this bool.
+  bool closeLeft_;
+};
+
+AnnotatedText restoreSource(AnnotatedText const &in, std::vector<HTML::Span> const &sourceSpans,
+                            std::vector<SpanIterator> &sourceTokenSpans) {
+  auto spanIt = sourceSpans.begin();
+  auto prevIt = sourceSpans.begin();  // safe because first span is always empty span, and
+                                      // and the while-loop below will do the rest
+  assert(prevIt == sourceSpans.end() || prevIt->tags.empty());

  return apply(in, [&](ByteRange range, string_view token, bool last) {
-    // Do encoding of any entities that popped up in the translation
-    // (Also effectively clears html from previous call)
-    encodeEntities(token, html);
-
-    size_t offset = 0;  // Size added by prepending HTML
-    size_t whitespace_size = countPrefixWhitespaces(token);
-
-    // Close tags we want to show up left (before) the token, but open tags
-    // ideally come directly after any prefix whitespace. However, some tokens
-    // match multiple spans. If a previous span has added an open tag, after any
-    // whitespace, and the next span closes said tag again, we need to close
-    // it after the whitespace. So after the first open tag, any closing tag
-    // should also align right, after whitespace, not before. Hence this bool.
-    bool close_left = true;
+    TokenFormatter formatter(token);

    // Potential issue: spans and tokens can intersect, e.g.
    //
@ -295,27 +332,16 @@ AnnotatedText restoreSource(AnnotatedText const &in, std::vector<HTML::Taint> &t
    //  tokens     |111111111111111|2|
    //
    // Now 1 covers span 1 to 3, so what taint should it get? Just <p>, or <p><u>?
+    // Note: only relevant if isBlockElement is used. If we just insert spaces
+    // around all elements, every segment of `hello` will be a token.

    // Seek to the last span that overlaps with this token
    while (true) {
-      diffTags(prev_it->tags, span_it->tags, opening, closing);
-      prev_it = span_it;
+      formatter.append(prevIt->tags, spanIt->tags);
+      prevIt = spanIt;

-      for (auto cit = closing.crbegin(); cit != closing.crend(); ++cit) {
-        std::string close_tag = format("</{}>", (*cit)->name);
-        html.insert(offset + (close_left ? 0 : whitespace_size), close_tag);
-        offset += close_tag.size();
-      }
-
-      for (HTML::Tag const *tag : opening) {
-        std::string open_tag = format("<{}{}>", tag->name, tag->attributes);
-        html.insert(offset + whitespace_size, open_tag);
-        offset += open_tag.size();
-        close_left = false;
-      }
-
-      if (span_it + 1 != span_end && ((span_it + 1)->begin < range.end || last)) {
-        span_it++;
+      if (spanIt + 1 != sourceSpans.end() && ((spanIt + 1)->begin < range.end || last)) {
+        spanIt++;
        continue;
      }

@ -323,71 +349,69 @@ AnnotatedText restoreSource(AnnotatedText const &in, std::vector<HTML::Taint> &t
    }

    // TODO: This is just the taint of the last span, not the ones in between.
-    // This makes us lose empty tags, and maybe some markup as well, in the
-    // response target HTML restoration.
-    token_tags.push_back(prev_it->tags);
+    // This makes us lose some markup of parts of tokens as described above.
+    sourceTokenSpans.push_back(prevIt);

-    return html;
+    return std::move(formatter.html());
  });
 }

-AnnotatedText restoreTarget(AnnotatedText const &in, std::vector<HTML::Taint> const &token_tags_target) {
-  auto token_prev_it = token_tags_target.begin();
-  auto token_tags_it = token_tags_target.begin() + 1;
-
-  // workspace for lambda
-  std::string html;
-  HTML::Taint opening, closing;
+AnnotatedText restoreTarget(AnnotatedText const &in, std::vector<HTML::Span> const &sourceSpans,
+                            std::vector<SpanIterator> const &targetTokenSpans) {
+  auto prevSpan = sourceSpans.begin();
+  auto targetSpanIt = targetTokenSpans.begin();

  AnnotatedText out = apply(in, [&](ByteRange range, string_view token, bool last) {
-    // Do encoding of any entities that popped up in the translation
-    // (Also effectively clears html from previous call)
-    encodeEntities(token, html);
+    TokenFormatter formatter(token);

-    size_t offset = 0;  // Size added by prepending HTML
-    size_t whitespace_size = countPrefixWhitespaces(token);
+    // First we scan through spans_ to catch up to the span assigned to this
+    // token. We're only interested in empty spans (empty and void elements)
+    for (auto span_it = prevSpan + 1; span_it < *targetSpanIt; span_it++) {
+      // We're only interested in empty spans between the spans in targetSpanIt
+      if (span_it->size() != 0) continue;

-    assert(token_tags_it != token_tags_target.end());
-    diffTags(*token_prev_it, *token_tags_it, opening, closing);
+      formatter.append(prevSpan->tags, span_it->tags);

-    for (auto cit = closing.crbegin(); cit != closing.crend(); ++cit) {
-      std::string close_tag = format("</{}>", (*cit)->name);
-      html.insert(offset, close_tag);
-      offset += close_tag.size();
+      // Note: here, not in 3rd part of for-statement because we don't want to
+      // set prevSpan if the continue clause at the beginning of this for-loop
+      // was hit.
+      prevSpan = span_it;
    }

-    for (HTML::Tag const *tag : opening) {
-      std::string open_tag = format("<{}{}>", tag->name, tag->attributes);
-      html.insert(offset + whitespace_size, open_tag);
-      offset += open_tag.size();
-    }
+    // Now do the same thing but for our target set of tags. Note that we cannot
+    // combine this in the for-loop above (i.e. `span_it <= *targetSpanIt`)
+    // because there is no guarantee that the order in `targetTokenSpans` is
+    // the same as that of `spans`.
+    formatter.append(prevSpan->tags, (*targetSpanIt)->tags);

    // If this is the last token of the response, close all open tags.
    if (last) {
-      for (auto cit = token_tags_it->crbegin(); cit != token_tags_it->crend(); ++cit) {
-        html += format("</{}>", (*cit)->name);
-      }
+      // Note: this assert is true due to our current implementation of
+      // HardAlignments() that always matches the last token of the input with
+      // the last token of the output. But lets assume someone someday changes
+      // HardAlignments(), and then this for-loop will be necessary.
+      // assert((*targetSpanIt)->tags.empty());
+      formatter.append((*targetSpanIt)->tags, HTML::Taint());
    }

-    ++token_prev_it;
-    ++token_tags_it;
+    prevSpan = *targetSpanIt++;

-    return html;
+    return std::move(formatter.html());
  });

  // Assert that we did in fact use all our taints
-  assert(token_tags_it == token_tags_target.end());
+  assert(targetSpanIt == targetTokenSpans.end());

  return out;
 }

 std::ostream &debugPrintMapping(std::ostream &out, Response const &response,
                                std::vector<std::vector<size_t>> const &alignments,
-                                std::vector<HTML::Taint> const &token_tags_target) {
-  auto taints = token_tags_target.begin();
+                                std::vector<SpanIterator> const &targetTokenSpans) {
+  auto spans = targetTokenSpans.begin();
  for (size_t sentenceIdx = 0; sentenceIdx < response.target.numSentences(); ++sentenceIdx) {
    out << "Mapped sentence prefix with tags: ";
-    for (auto &&taint : *(++taints)) out << '/' << taint->name;
+    for (auto &&taint : (*++spans)->tags) out << '/' << taint->name;
    out << '\n';

    for (size_t wordIdx = 0; wordIdx < response.target.numWords(sentenceIdx); ++wordIdx) {
@ -399,16 +423,16 @@ std::ostream &debugPrintMapping(std::ostream &out, Response const &response,
      out << " to ";
      out << std::setw(10) << std::setfill(' ') << response.source.word(sentenceIdx, alignments[sentenceIdx][wordIdx]);
      out << " with tags: ";
-      for (auto &&taint : *(++taints)) out << '/' << taint->name;
+      for (auto &&taint : (*++spans)->tags) out << '/' << taint->name;
      out << '\n';
    }
  }

  out << "Mapped end-of-input with tags: ";
-  for (auto &&taint : *(++taints)) out << '/' << taint->name;
+  for (auto &&taint : (*++spans)->tags) out << '/' << taint->name;
  out << '\n';

-  assert(++taints == token_tags_target.end());
+  assert(++spans == targetTokenSpans.end());
  return out;
 }

@ -467,7 +491,6 @@ HTML::HTML(std::string &&source, bool process_markup) {
        auto begin = source.size();
        source.append(scanner.value());
        spans_.push_back(Span{begin, source.size(), stack});
-        filterEmpty(stack);
      } break;

      case markup::Scanner::TT_TAG_START:
@ -539,32 +562,32 @@ void HTML::restore(Response &response) {

  // Reconstruction of HTML tags:
  // 1. Map each token to a Span
-  // 2. Apply the taint of that span to the token
-  // 3. Reconstruct the source HTML with these tainted tokens
-  // 4. Transfer the taint from the source tokens to the target tokens using alignment information
+  // 2. Reconstruct the source HTML with these tainted tokens
+  // 3. Transfer the spans from the source tokens to the target tokens using alignment information
+  // 4. For spans that represent empty elements (e.g. <img>) figure out their position
  // 5. Reconstruct the target HTML with these tainted tokens

-  std::vector<Taint> token_tags;  // List of HTML tags active per token in source
-                                  // Calculating these is a side-effect of restoring
-                                  // the HTML in response.source.
+  // sourceTokenSpans is a vector with a pointer to a span for each token. We
+  // use iterators here to point to these positions so we can easily compare if
+  // one span comes before or after another, information we'll need when we need
+  // to figure out whether we've skipped spans (of emtpy elements) when
+  // reconstructing HTML in response.target.
+  std::vector<SpanIterator> sourceTokenSpans;

-  AnnotatedText source = restoreSource(response.source, token_tags, spans_.cbegin(), spans_.cend());
-  assert(token_tags.size() == debugCountTokens(response.source));
+  // RestoreSource re-inserts HTML into the source text, but also identifies
+  // which span each source token fits into best.
+  AnnotatedText source = restoreSource(response.source, spans_, sourceTokenSpans);
+  assert(sourceTokenSpans.size() == debugCountTokens(response.source));

  // Find for every token in target the token in source that best matches.
  std::vector<std::vector<size_t>> alignments;
  hardAlignments(response, alignments);

-  std::vector<Taint> token_tags_target;
-  token_tags_target.emplace_back();  // add empty one to the beginning for easy
-                                     // life later on (we start iterating at 1,
-                                     // and can then do i - 1 for empty.
-  copyTaint(response, alignments, token_tags, token_tags_target);
-  assert(token_tags_target.size() == debugCountTokens(response.target) + 1);
+  std::vector<SpanIterator> targetTokenSpans;
+  copyTaint(response, alignments, sourceTokenSpans, targetTokenSpans);
+  assert(targetTokenSpans.size() == debugCountTokens(response.target));

-  // DebugPrintMapping(std::cerr, response, alignments, token_tags_target);
-
-  AnnotatedText target = restoreTarget(response.target, token_tags_target);
+  AnnotatedText target = restoreTarget(response.target, spans_, targetTokenSpans);

  response.source = source;
  response.target = target;