Improve handling HTML special cases (#312)

- Prefer spreading markup over a full word.
- Ignore certain tags whose contents are unlikely to need translating,
  such as `<code>` and `<samp>`.
- Never treat `<wbr>` as a space.
- Allow for inconsistent cases in tag names.
- Fix bug where void elements were inserted multiple times.
- Better handling of whitespace around punctuation.
- Ignore parsing `<noscript>` to be compatible with Firefox.
- Improvements to documentation and readability of `HTML` and `Scanner`
  classes.

Fixes: #313, #339
This commit is contained in:
Jelmer 2022-02-22 20:25:34 +00:00 committed by GitHub
parent 9eb243725b
commit 1f98f971a5
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 530 additions and 170 deletions

@ -1 +1 @@
Subproject commit 3c0f95a1775a74f5db441aa2f17ceb7437679022
Subproject commit 3776609ce5f7a238245e303efaa007b2d5078180

View File

@ -172,6 +172,16 @@ TEST_CASE("Do not abort if the input is just empty element") {
CHECK(response.target.text == "<p></p>");
}
TEST_CASE("Tag names are case insensitive") {
  // Covers three things at once: <P> is closed by </p>, <BR> has to be
  // recognised as the void element <br>, and <B> has to be recognised as an
  // inline element.
  // The HTML constructor takes the string by rvalue reference and rewrites it
  // in place, which is why `text` is inspected after the std::move.
  std::string text{"<P><B>Spa</B>ce<BR>please?</p>"};
  HTML html(std::move(text), true);
  const std::string expected{"Spa ce\n\nplease?"};
  CHECK(text == expected);
}
TEST_CASE("Test case html entities") {
// These are all entities I would expect in innerHTML, since all other entities
// can be encoded as UTF-8 so there's no need to encode them through &...; when
@ -618,6 +628,72 @@ TEST_CASE("Test comment") {
CHECK(response.target.text == test_str);
}
TEST_CASE("Test <wbr> element") {
  // A <wbr> in the middle of a word must vanish without leaving a space.
  // The HTML constructor rewrites `text` in place, so it is checked after
  // being passed through std::move.
  std::string text{"hel<wbr>lo"};
  HTML html(std::move(text), true);
  const std::string expected{"hello"};
  CHECK(text == expected);
}
TEST_CASE("Test <wbr> element (case-insensitive)") {
  // Same as the lower-case <wbr> test, but with the tag name in upper case.
  // The HTML constructor rewrites `text` in place, so it is checked after
  // being passed through std::move.
  std::string text{"hel<WBR>lo"};
  HTML html(std::move(text), true);
  const std::string expected{"hello"};
  CHECK(text == expected);
}
TEST_CASE("Test ignored element (nested)") {
// An ignored element (<var>) that contains another element of the same name
// must be removed from the text as a whole and put back by restore().
std::string test_str("foo <var><var>nested</var></var> bar");
std::string expected_str("foo <var><var>nested</var></var>bar");
std::string input(test_str);
// The HTML constructor rewrites `input` in place, leaving only the text.
HTML html(std::move(input), true);
CHECK(input == "foo bar");
Response response;
// Stand-in for a translation: source and target receive the same tokens.
// NOTE(review): the views below address bytes 0..8 of sentence_str, one more
// than the literal shown here contains. Presumably consecutive spaces were
// collapsed somewhere ("foo" + two spaces + "bar"); confirm against the
// original file, including the CHECK above and expected_str.
std::string sentence_str("foo bar");
std::vector<string_view> sentence{
string_view(sentence_str.data() + 0, 3), // "foo"
string_view(sentence_str.data() + 3, 1), // "_" (a single space)
string_view(sentence_str.data() + 4, 4), // "_bar" (space followed by "bar")
string_view(sentence_str.data() + 8, 0), // "" (empty final token)
};
response.source.appendSentence("", sentence.begin(), sentence.end());
response.target.appendSentence("", sentence.begin(), sentence.end());
// One alignment matrix for the single sentence, sized for its 4 tokens.
// Presumably identity_matrix aligns token i with token i; verify the helper.
response.alignments = {identity_matrix<float>(4)};
// restore() rewrites both response.source and response.target with markup.
html.restore(response);
CHECK(response.source.text == expected_str);
CHECK(response.target.text == expected_str);
}
TEST_CASE("Test ignored element (with entity)") {
// The body of an ignored element (<var>) must come back from restore()
// exactly as written, here with the `&amp;` entity still encoded.
std::string test_str("foo <var>&amp;</var> bar");
std::string expected_str("foo <var>&amp;</var>bar");
std::string input(test_str);
// The HTML constructor rewrites `input` in place, leaving only the text.
HTML html(std::move(input), true);
CHECK(input == "foo bar");
Response response;
// Stand-in for a translation: source and target receive the same tokens.
// NOTE(review): the views below address bytes 0..8 of sentence_str, one more
// than the literal shown here contains. Presumably consecutive spaces were
// collapsed somewhere ("foo" + two spaces + "bar"); confirm against the
// original file, including the CHECK above and expected_str.
std::string sentence_str("foo bar");
std::vector<string_view> sentence{
string_view(sentence_str.data() + 0, 3), // "foo"
string_view(sentence_str.data() + 3, 1), // "_" (a single space)
string_view(sentence_str.data() + 4, 4), // "_bar" (space followed by "bar")
string_view(sentence_str.data() + 8, 0), // "" (empty final token)
};
response.source.appendSentence("", sentence.begin(), sentence.end());
response.target.appendSentence("", sentence.begin(), sentence.end());
// One alignment matrix for the single sentence, sized for its 4 tokens.
// Presumably identity_matrix aligns token i with token i; verify the helper.
response.alignments = {identity_matrix<float>(4)};
// restore() rewrites both response.source and response.target with markup.
html.restore(response);
CHECK(response.source.text == expected_str);
CHECK(response.target.text == expected_str);
}
TEST_CASE("End-to-end translation", "[!mayfail]") {
std::string input("<p>I <b>like</b> to <u>drive</u> this car.</p>");
HTML html(std::move(input), true);

View File

@ -185,6 +185,41 @@ struct AnnotatedText {
/// Returns a ByteRange representing sentence corresponding to sentenceIdx.
ByteRange sentenceAsByteRange(size_t sentenceIdx) const { return annotation.sentence(sentenceIdx); }
/// Builds a new `AnnotatedText` by passing every piece of this one through
/// `fun` and stitching the returned strings back together.
///
/// `fun` is invoked as `fun(ByteRange, string_view, bool last)` and its return
/// value is used as the replacement text. For each sentence it is called first
/// on the gap that precedes the sentence and then on each word (subword token
/// effectively) in order, always with `last == false`. After the final
/// sentence it is called once more on the trailing gap, the ending whitespace
/// slot of the `AnnotatedText`, with `last == true`.
template <typename Fun>
AnnotatedText apply(Fun fun) const {
  AnnotatedText result;

  for (size_t s = 0; s < numSentences(); ++s) {
    // The gap in front of the sentence is transformed before any of its words.
    std::string gapText = fun(annotation.gap(s), gap(s), false);

    // Transformed words are concatenated into `joined`. Only their offsets are
    // recorded while `joined` is still growing, because pointers into it would
    // be invalidated by a reallocation.
    std::string joined;
    std::vector<ByteRange> ranges;
    for (size_t w = 0; w < numWords(s); ++w) {
      std::string piece = fun(wordAsByteRange(s, w), word(s, w), false);
      size_t const start = joined.size();
      joined += piece;
      ranges.push_back(ByteRange{start, joined.size()});
    }

    // `joined` is final now, so the offsets can be turned into the
    // string_views that appendSentence expects.
    std::vector<marian::string_view> views;
    views.reserve(ranges.size());
    for (ByteRange const &range : ranges) {
      views.emplace_back(joined.data() + range.begin, range.size());
    }

    result.appendSentence(gapText, views.begin(), views.end());
  }

  // The gap after the last sentence is the only call that sees `last == true`.
  result.appendEndingWhitespace(fun(annotation.gap(numSentences()), gap(numSentences()), true));
  return result;
}
private:
string_view asStringView(const ByteRange &byteRange) const {
return string_view(text.data() + byteRange.begin, byteRange.size());

View File

@ -1,21 +1,23 @@
#include "html.h"
#include <algorithm>
#include "response.h"
#include "xh_scanner.h"
namespace {
using marian::string_view;
using marian::bergamot::AnnotatedText;
using marian::bergamot::ByteRange;
using marian::bergamot::HTML;
using marian::bergamot::Response;
void encodeEntities(string_view const &input, std::string &output) {
/// Encodes the minimum of HTML entities.
void encodeEntities(marian::string_view const &input, std::string &output) {
output.clear();
output.reserve(input.size()); // assumes there are no entities in most cases
for (auto it = input.begin(); it != input.end(); ++it) {
switch (*it) {
for (char it : input) {
switch (it) {
case '&':
output.append("&amp;");
break;
@ -35,19 +37,30 @@ void encodeEntities(string_view const &input, std::string &output) {
// output.append("&apos;");
// break;
default:
output.push_back(*it);
output.push_back(it);
break;
}
}
}
size_t countPrefixWhitespaces(string_view const &input) {
/// Counts number of whitespace characters at the start of the input. Used
/// for determining where to insert an open or close tag.
///
/// Returns the length of the leading run of characters for which
/// `std::isspace` is true: 0 for empty input or input that starts with a
/// non-whitespace character, `input.size()` if everything is whitespace.
size_t countPrefixWhitespaces(marian::string_view const &input) {
  size_t size = 0;
  // std::isspace is only defined for EOF and values representable as
  // unsigned char. A plain char may be negative for any non-ASCII byte (for
  // example the bytes of a multi-byte UTF-8 sequence), so convert first.
  while (size < input.size() && std::isspace(static_cast<unsigned char>(input[size]))) ++size;
  return size;
}
// Very simple replacement for std::format introduced in C++20
/// Returns a copy of `input` in which every character has been passed through
/// `std::tolower`. Used to compare tag names without regard to case.
std::string toLowerCase(std::string_view const &input) {
  std::string lowered;
  lowered.reserve(input.size());
  // Go through unsigned char: std::tolower is not defined for negative values.
  for (unsigned char ch : input) {
    lowered.push_back(static_cast<char>(std::tolower(ch)));
  }
  return lowered;
}
/// Very simple replacement for std::format introduced in C++20. Only supports
/// replacing `{}` in the template string with whatever `operator<<` for that
/// type turns it into.
std::string format(std::string const &formatTemplate) { return formatTemplate; }
template <typename Arg>
@ -68,14 +81,14 @@ std::string format(std::string const &formatTemplate, Arg arg, Args... args) {
return os.str();
}
// Syntactic sugar around rbegin() and rend() that allows me to write
// `for (auto &&item : reversed(container))` instead of the needlessly verbose
// `for (auto it = container.rbegin(); it != container.rend(); ++it)`
/// Syntactic sugar around rbegin() and rend() that allows me to write
/// `for (auto &&item : reversed(container))` instead of the needlessly verbose
/// `for (auto it = container.rbegin(); it != container.rend(); ++it)`
template <typename T>
class reversed {
class Reversed {
public:
typedef typename T::const_reverse_iterator iterator;
explicit reversed(T const &container) : container_(container){};
using iterator = typename T::const_reverse_iterator;
explicit Reversed(T const &container) : container_(container){};
iterator begin() const { return container_.rbegin(); }
iterator end() const { return container_.rend(); }
@ -83,11 +96,10 @@ class reversed {
T const &container_;
};
bool contains(std::unordered_set<std::string> const &set, std::string const &name) {
return set.find(name) != set.end();
}
void diffTags(HTML::Taint const &prev, HTML::Taint const &curr, HTML::Taint &opening, HTML::Taint &closing) {
/// When comparing two tag stacks, determine which tags need to be closed and
/// opened to get from one stack to the other.
void diffTags(HTML::TagStack const &prev, HTML::TagStack const &curr, HTML::TagStack &opening,
HTML::TagStack &closing) {
opening.clear();
closing.clear();
@ -98,9 +110,11 @@ void diffTags(HTML::Taint const &prev, HTML::Taint const &curr, HTML::Taint &ope
if (i >= curr.size() || prev[i] != curr[i]) break;
// Only nodes of type ELEMENT can have children and thus would need a closing tag.
// NOLINTNEXTLINE(bugprone-narrowing-conversions)
std::copy_if(prev.begin() + i, prev.end(), std::back_inserter(closing),
[&](HTML::Tag *tag) { return tag->type == HTML::Tag::ELEMENT; });
// NOLINTNEXTLINE(bugprone-narrowing-conversions)
opening.insert(opening.end(), curr.begin() + i, curr.end());
}
@ -108,42 +122,24 @@ bool intersects(ByteRange const &range, HTML::Span const &span) {
return range.begin <= span.end && range.end >= span.begin;
};
bool containsTag(HTML::Taint const &stack, HTML::Tag const *tag) {
/// Tells whether `name` is one of the tag names in `set`.
bool contains(HTML::TagNameSet const &set, std::string_view const &name) {
  auto const match = set.find(name);
  return match != set.end();
}
/// Tells whether the exact tag object `tag` (compared by pointer) occurs in
/// `stack`. The stack is searched from its last element backwards.
bool contains(HTML::TagStack const &stack, HTML::Tag const *tag) {
  return std::any_of(stack.rbegin(), stack.rend(), [tag](HTML::Tag const *candidate) { return candidate == tag; });
}
template <typename Fun>
AnnotatedText apply(AnnotatedText const &in, Fun fun) {
AnnotatedText out;
/// Is tag stack B an extended version of A? I.e. same tags, but maybe a few
/// more nested deeper.
bool extends(HTML::TagStack const &b, HTML::TagStack const &a) {
if (a.size() > b.size()) return false;
for (size_t sentenceIdx = 0; sentenceIdx < in.numSentences(); ++sentenceIdx) {
std::string sentence;
std::vector<ByteRange> tokens;
for (auto i = a.begin(), j = b.begin(); i != a.end(); ++i, ++j)
if (*i != *j) return false;
std::string prefix = fun(in.annotation.gap(sentenceIdx), in.gap(sentenceIdx), false);
for (size_t wordIdx = 0; wordIdx < in.numWords(sentenceIdx); ++wordIdx) {
std::string token = fun(in.wordAsByteRange(sentenceIdx, wordIdx), in.word(sentenceIdx, wordIdx), false);
tokens.push_back(ByteRange{sentence.size(), sentence.size() + token.size()});
sentence += token;
}
// Convert our ByteRanges to string_views since that's what appendSentence
// expects
// TODO: extend AnnotatedText::appendSentence to accept str + ByteRanges
// directly
std::vector<string_view> views(tokens.size());
std::transform(tokens.begin(), tokens.end(), views.begin(),
[&](ByteRange const &range) { return string_view(sentence.data() + range.begin, range.size()); });
out.appendSentence(prefix, views.begin(), views.end());
}
out.appendEndingWhitespace(fun(in.annotation.gap(in.numSentences()), in.gap(in.numSentences()), true));
return out;
return true;
}
/// Tests whether `response` has alignment info associated with it or not.
bool hasAlignments(Response const &response) {
// Test each sentence individually, as a sentence may be empty (or there
// might be no sentences at all), so just testing for alignments.empty() would not be
@ -162,11 +158,12 @@ bool hasAlignments(Response const &response) {
return true;
}
// Little helper class to append HTML to a token
/// Helper class to append HTML tags to a token. Also makes sure the token is
/// encoded as valid HTML.
class TokenFormatter {
public:
explicit TokenFormatter(string_view token)
: html_(), offset_(0), whitespaceOffset_(0), whitespaceSize_(countPrefixWhitespaces(token)), closeLeft_(true) {
explicit TokenFormatter(marian::string_view token)
: offset_(0), whitespaceOffset_(0), whitespaceSize_(countPrefixWhitespaces(token)), closeLeft_(true) {
// Do encoding of any entities that popped up in the translation
encodeEntities(token, html_);
}
@ -174,12 +171,12 @@ class TokenFormatter {
std::string &&html() { return std::move(html_); }
// Append the markup necessary for moving from `prev` set of tags to `curr`.
void append(HTML::Taint const &prev, HTML::Taint const &curr) {
HTML::Taint opening, closing;
void append(HTML::TagStack const &prev, HTML::TagStack const &curr) {
HTML::TagStack opening, closing;
diffTags(prev, curr, opening, closing);
for (HTML::Tag const *tag : reversed(closing)) {
for (HTML::Tag const *tag : Reversed(closing)) {
assert(tag->type == HTML::Tag::ELEMENT);
std::string closeTag = format("</{}>", tag->name);
html_.insert(offset_ + (closeLeft_ ? 0 : whitespaceSize_), closeTag);
@ -232,6 +229,8 @@ class TokenFormatter {
bool closeLeft_;
};
/// Count the number of tokens in an AnnotatedText. Used to assert we're not
/// running out of sync when creating vectors that describe each token.
size_t debugCountTokens(AnnotatedText const &text) {
size_t tokens = 1; // for the ending gap
for (size_t sentenceIdx = 0; sentenceIdx < text.numSentences(); ++sentenceIdx) {
@ -240,11 +239,87 @@ size_t debugCountTokens(AnnotatedText const &text) {
return tokens;
}
/// Helper function that consumes a tag as if it is a special tag, except that
/// it takes nesting into account. I.e. `<a><a></a></a>` will be consumed to the
/// last `</a>`. Assumes TT_TAG_START is already consumed, which was necessary
/// to determine whether this was an element that needed to be ignored.
///
/// Every attribute of the open tag is appended to `tag.attributes`, and
/// `tag.data` is set to the raw input between the end of the open tag and the
/// start of the matching close tag. Aborts on a scanner error, or when the
/// input ends before the matching close tag has been found.
///
/// `name` is compared against the lower-cased `scanner.tag()`, so it is
/// expected to be lower-case already.
void consumeIgnoredTag(markup::Scanner &scanner, HTML::Tag &tag, std::string const &name) {
  // Only full elements can be consumed this way. With void tags we don't know
  // where to stop scanning. All other types cannot be nested anyway.
  assert(tag.type == HTML::Tag::ELEMENT);

  // TT_TAG_START is already consumed.
  markup::Scanner::TokenType token;

  // Nesting depth: 0 while still reading the open tag, 1 or more once inside
  // the element.
  size_t inside = 0;

  // Consume the full open tag, i.e. all its attributes
  while (!inside) {
    token = scanner.next();
    switch (token) {
      case markup::Scanner::TT_ERROR:
        ABORT("HTML parse error");
      case markup::Scanner::TT_EOF:
        ABORT("Did not find closing tag </{}>", name);
      case markup::Scanner::TT_ATTRIBUTE:
        tag.attributes += format(" {}=\"{}\"", scanner.attribute(), scanner.value());
        break;
      default:
        // Not an attribute! Must be something inside the body or the closing
        // tag already. Time to jump to the next loop.
        ++inside;
        break;
    }
  }

  // Last token was something that would have triggered Scanner::scanBody(),
  // so the scanner is taken to point at the start of the body here.
  // NOTE(review): the original comment spoke of value() while the code reads
  // start(); confirm that start() is the intended accessor.
  const char *start = scanner.start();

  // Consume the rest of the HTML until (including) the final closing tag. We
  // start with the token that caused the previous loop to fall into the default
  // case.
  while (inside) {
    switch (token) {
      case markup::Scanner::TT_ERROR:
        ABORT("HTML parse error");
      case markup::Scanner::TT_EOF:
        // The message has a `{}` placeholder, so the tag name has to be
        // passed along, just like in the first loop.
        ABORT("Did not find closing tag </{}>", name);
      case markup::Scanner::TT_TAG_START:
        // Note: Looking specifically for only our own type of tag so we don't
        // have to care about whether other tags we encounter are void tags or
        // not. Does assume the HTML is valid, as no stack is kept.
        if (toLowerCase(scanner.tag()) == name) ++inside;
        break;
      case markup::Scanner::TT_TAG_END:
        if (toLowerCase(scanner.tag()) == name) --inside;
        break;
      default:
        break;
    }

    // Only continue scanning if we're still inside. We could have just read the
    // TT_TAG_END token that ended this element, and we don't want to continue
    // consuming tokens at that point.
    if (inside) token = scanner.next();
  }

  // Only a TAG_END could have stopped the previous loop. We take the start
  // of the final closing tag as the end of our data.
  assert(token == markup::Scanner::TT_TAG_END);
  const char *end = scanner.start();

  // All data between the end of the first open element, and the start of the
  // last close element, we just treat as raw data that will be printed when
  // this tag is eventually printed.
  assert(end >= start);
  tag.data = std::string_view(start, end - start);
}
} // namespace
namespace marian::bergamot {
// Formatters used for exception messages combined with format()
/// Formatters used for formatting error messages in ABORT() calls.
std::ostream &operator<<(std::ostream &out, HTML::Tag const *tag) {
if (tag == nullptr) return out << "[nullptr]";
switch (tag->type) {
@ -262,7 +337,7 @@ std::ostream &operator<<(std::ostream &out, HTML::Tag const *tag) {
return out << "[Unknown tag type]";
}
std::ostream &operator<<(std::ostream &out, HTML::Taint const &tags) {
std::ostream &operator<<(std::ostream &out, HTML::TagStack const &tags) {
for (auto it = tags.begin(); it != tags.end(); ++it) {
if (it != tags.begin()) out << ' ';
out << *it;
@ -270,18 +345,20 @@ std::ostream &operator<<(std::ostream &out, HTML::Taint const &tags) {
return out;
}
HTML::HTML(std::string &&source, bool process_markup, Options &&options) : options_(std::move(options)) {
if (!process_markup) return;
HTML::HTML(std::string &&source, bool processMarkup, Options &&options) : options_(std::move(options)) {
if (!processMarkup) return;
std::string original = std::move(source);
markup::instream in(original.data(), original.data() + original.size());
markup::Scanner scanner(in);
source.clear(); // source is moved out of, so should be clear anyway
Tag *tag;
Taint stack;
bool addSentenceBreak = false;
bool addSpace = false;
Tag *tag = nullptr; // current tag (after opening at least)
TagStack stack; // stack of currently open tags
bool addSentenceBreak = false; // whether to add a sentence break next text segment
bool addWordBreak = false; // whether to add a word break next text segment
// Starting point: an empty span with no open tags.
spans_.push_back(Span{0, 0, {}});
bool stop = false;
@ -298,13 +375,14 @@ HTML::HTML(std::string &&source, bool process_markup, Options &&options) : optio
// If the previous segment was the open or close tag of a block element
// we treat the text after it as a new sentence.
if (addSentenceBreak) {
if (!(source.empty() || (source.size() > 2 && source.substr(source.size() - 2) == ""))) {
// If there isn't already a \n\n at the end of source...
if (source.size() >= 2 && source.substr(source.size() - 2) != "\n\n") {
stack.push_back(makeTag({Tag::WHITESPACE}));
// Important: span->size() == 0 to make it behave as a void element.
// Also important: position before the \n\n tokens, not after, to
// make it easier to remove them later through apply().
spans_.push_back(Span{source.size(), source.size(), stack});
source.append("\n\n"); // TODO assumes ssplit-mode = wrapped_text
source.append("\n\n"); // Should work with ssplit-mode = wrapped_text
stack.pop_back();
}
addSentenceBreak = false;
@ -312,24 +390,27 @@ HTML::HTML(std::string &&source, bool process_markup, Options &&options) : optio
// If the previous segment was an open or close tag, it might be best
// to add a space to make sure we don't append to the previous word.
if (addSpace) {
if (options_.substituteInlineTagsWithSpaces && !source.empty() && !std::isspace(source.back()) &&
!std::isspace(scanner.value()[0])) {
if (addWordBreak) {
// Only add the space when it would be inside a word. Do not add it if
// it would be between a word and punctuation.
if (options_.substituteInlineTagsWithSpaces && isContinuation(source, scanner.value())) {
source.push_back(' ');
}
addSpace = false;
addWordBreak = false;
}
// Store which tags were open when this span of text was encountered.
auto begin = source.size();
source.append(scanner.value());
spans_.push_back(Span{begin, source.size(), stack});
} break;
case markup::Scanner::TT_TAG_START: {
std::string name(scanner.tag());
std::string name = toLowerCase(scanner.tag());
// Tag *tag is used by attribute parsing
tag = makeTag({contains(options_.voidTags, name) ? Tag::VOID_ELEMENT : Tag::ELEMENT, std::move(name)});
auto type = contains(options_.voidTags, name) ? Tag::VOID_ELEMENT : Tag::ELEMENT;
tag = makeTag({type, std::string(scanner.tag())});
stack.push_back(tag);
@ -341,39 +422,48 @@ HTML::HTML(std::string &&source, bool process_markup, Options &&options) : optio
stack.pop_back();
}
// Ignored tags have same semantics as void tags with regards to moving
// them around with the rest of the content.
if (contains(options_.ignoredTags, name)) {
consumeIgnoredTag(scanner, *tag, name);
spans_.push_back(Span{source.size(), source.size(), stack});
stack.pop_back();
}
// Treat non-inline HTML tags as spaces that break up words.
if (!contains(options_.inlineTags, tag->name)) {
if (!contains(options_.inlineTags, name)) {
addSentenceBreak = true;
} else {
addSpace = true;
} else if (!contains(options_.inWordTags, name)) {
addWordBreak = true;
}
} break;
case markup::Scanner::TT_TAG_END:
case markup::Scanner::TT_TAG_END: {
std::string tagName = toLowerCase(scanner.tag());
// If this is the closing bit of a void tag, i.e. triggered by the "/>"
// bit of "<img/>", then completely ignore it.
if (contains(options_.voidTags, std::string(scanner.tag()))) break;
if (contains(options_.voidTags, tagName)) break;
ABORT_IF(stack.empty(), "Encountered more closing tags ({}) than opening tags", scanner.tag());
ABORT_IF(stack.back()->name != scanner.tag(), "Encountered unexpected closing tag </{}>, stack is {}",
scanner.tag(), stack);
ABORT_IF(toLowerCase(stack.back()->name) != toLowerCase(scanner.tag()),
"Encountered unexpected closing tag </{}>, stack is {}", scanner.tag(), stack);
// What to do with "<u></u>" case, where tag is immediately closed
// so it never makes it into the taint of any of the spans? This adds
// an empty span so it still gets recorded in spans_.
if (spans_.empty() || !containsTag(spans_.back().tags, stack.back()))
if (spans_.empty() || !contains(spans_.back().tags, stack.back()))
spans_.push_back(Span{source.size(), source.size(), stack});
stack.pop_back();
// Add space if necessary
if (!contains(options_.inlineTags, std::string(scanner.tag()))) {
if (!contains(options_.inlineTags, tagName)) {
addSentenceBreak = true;
} else {
addSpace = true;
} else if (!contains(options_.inWordTags, tagName)) {
addWordBreak = true;
}
break;
} break;
case markup::Scanner::TT_ATTRIBUTE:
assert(tag != nullptr);
@ -448,10 +538,10 @@ void HTML::restore(Response &response) {
// Find for every token in target the token in source that best matches.
std::vector<std::vector<size_t>> alignments;
hardAlignments(response, alignments);
hardAlignments(response, alignments, sourceTokenSpans);
std::vector<SpanIterator> targetTokenSpans;
copyTaint(response, alignments, sourceTokenSpans, targetTokenSpans);
copyTagStack(response, alignments, sourceTokenSpans, targetTokenSpans);
assert(targetTokenSpans.size() == debugCountTokens(response.target));
AnnotatedText target = restoreTarget(response.target, targetTokenSpans);
@ -466,7 +556,7 @@ AnnotatedText HTML::restoreSource(AnnotatedText const &in, std::vector<SpanItera
// and the while-loop below will do the rest
assert(prevIt == spans_.end() || prevIt->tags.empty());
return apply(in, [&](ByteRange range, string_view token, bool last) {
return in.apply([&](ByteRange range, string_view token, bool last) {
TokenFormatter formatter(token);
// Potential issue: spans and tokens can intersect, e.g.
@ -475,9 +565,11 @@ AnnotatedText HTML::restoreSource(AnnotatedText const &in, std::vector<SpanItera
// spans |1| |2| |3333| (so only 2 is tainted with <p><u>, others only <p>)
// tokens |111111111111111|2|
//
// Now 1 covers span 1 to 3, so what taint should it get? Just <p>, or <p><u>?
// Note: only relevant if isBlockElement is used. If we just insert spaces
// around all elements, every segment of `hello` will be a token.
// Now 1 covers span 1 to 3, so what taint should it get? Just `<p>`, or
// `<p><u>`?
// Note: only relevant if `substituteInlineTagsWithSpaces` is true. If we
// just insert spaces around all elements, every segment of `hello` will be
// a token.
// Seek to the last span that overlaps with this token
while (true) {
@ -494,7 +586,7 @@ AnnotatedText HTML::restoreSource(AnnotatedText const &in, std::vector<SpanItera
// TODO: This is just the taint of the last span, not the ones in between.
// This makes us lose some markup of parts of tokens as described above.
sourceTokenSpans.push_back(prevIt);
sourceTokenSpans.emplace_back(prevIt);
return std::move(formatter.html());
});
@ -503,27 +595,28 @@ AnnotatedText HTML::restoreSource(AnnotatedText const &in, std::vector<SpanItera
AnnotatedText HTML::restoreTarget(AnnotatedText const &in, std::vector<SpanIterator> const &targetTokenSpans) {
auto prevSpan = spans_.cbegin();
auto targetSpanIt = targetTokenSpans.begin();
auto straggerSpanIt = spans_.cbegin();
AnnotatedText out = apply(in, [&](ByteRange range, string_view token, bool last) {
AnnotatedText out = in.apply([&]([[maybe_unused]] ByteRange range, string_view token, bool last) {
TokenFormatter formatter(token);
// First we scan through spans_ to catch up to the span assigned to this
// token. We're only interested in empty spans (empty and void elements)
for (auto span_it = prevSpan; span_it < *targetSpanIt; span_it++) {
for (; straggerSpanIt < *targetSpanIt; ++straggerSpanIt) {
// We're only interested in empty spans or spans that would otherwise get
// lost because they didn't align with anything between the spans in
// targetSpanIt
// TODO That std::find makes this O(N*N) NOT GOOD NOT GOOD
if (span_it->size() != 0 &&
std::find(targetTokenSpans.begin(), targetTokenSpans.end(), span_it) != targetTokenSpans.end())
if (straggerSpanIt->size() != 0 &&
std::find(targetTokenSpans.begin(), targetTokenSpans.end(), straggerSpanIt) != targetTokenSpans.end())
continue;
formatter.append(prevSpan->tags, span_it->tags);
formatter.append(prevSpan->tags, straggerSpanIt->tags);
// Note: here, not in 3rd part of for-statement because we don't want to
// set prevSpan if the continue clause at the beginning of this for-loop
// was hit.
prevSpan = span_it;
prevSpan = straggerSpanIt;
}
// Now do the same thing but for our target set of tags. Note that we cannot
@ -539,7 +632,7 @@ AnnotatedText HTML::restoreTarget(AnnotatedText const &in, std::vector<SpanItera
// the last token of the output. But lets assume someone someday changes
// HardAlignments(), and then this for-loop will be necessary.
// assert((*targetSpanIt)->tags.empty());
formatter.append((*targetSpanIt)->tags, HTML::Taint());
formatter.append((*targetSpanIt)->tags, HTML::TagStack());
}
prevSpan = *targetSpanIt;
@ -559,8 +652,9 @@ HTML::Tag *HTML::makeTag(Tag &&tag) {
return &pool_.front();
}
void HTML::copyTaint(Response const &response, std::vector<std::vector<size_t>> const &alignments,
std::vector<SpanIterator> const &sourceTokenSpans, std::vector<SpanIterator> &targetTokenSpans) {
void HTML::copyTagStack(Response const &response, std::vector<std::vector<size_t>> const &alignments,
std::vector<SpanIterator> const &sourceTokenSpans,
std::vector<SpanIterator> &targetTokenSpans) {
size_t offset = 0; // Sentence offset in sourceTokenSpans
// Fill targetTokenSpans based on the alignments we just made up.
@ -584,14 +678,25 @@ void HTML::copyTaint(Response const &response, std::vector<std::vector<size_t>>
// to determine whether we should share the markup, or whether we should see
// this token as a fresh start. This implementation will treat "hello[world]"
// as 4 words, assuming it's tokenised as something like `h ell o [ wor ld ]`.
bool HTML::isContinuation(string_view prev, string_view str) {
bool HTML::isContinuation(std::string_view prev, std::string_view str) const {
if (options_.continuationDelimiters.empty()) return false;
if (prev.empty() || str.empty()) return false;
return options_.continuationDelimiters.find(str[0]) == std::string::npos &&
options_.continuationDelimiters.find(prev.back()) == std::string::npos;
}
void HTML::hardAlignments(Response const &response, std::vector<std::vector<size_t>> &alignments) {
/// Convenience overload for `marian::string_view`: re-wraps both arguments as
/// `std::string_view` and defers to the `std::string_view` overload.
bool HTML::isContinuation(marian::string_view prev, marian::string_view str) const {
  std::string_view const prevView(prev.data(), prev.size());
  std::string_view const strView(str.data(), str.size());
  return isContinuation(prevView, strView);
}
/// Selects for each token in `response.target` a best source token from
/// `response.source` and writes this selection to `alignments`. The source
/// token spans are used to also look at the markup applied to each token to
/// figure out which source token best represents each target token.
void HTML::hardAlignments(Response const &response, std::vector<std::vector<size_t>> &alignments,
std::vector<SpanIterator> const &sourceTokenSpans) {
size_t offset = 0; // sentence offset in sourceTokenSpans
// For each sentence...
for (size_t sentenceIdx = 0; sentenceIdx < response.target.numSentences(); ++sentenceIdx) {
alignments.emplace_back();
@ -600,14 +705,9 @@ void HTML::hardAlignments(Response const &response, std::vector<std::vector<size
// Note: only search from 0 to N-1 because token N is end-of-sentence token
// that can only align with the end-of-sentence token of the target
for (size_t t = 0; t + 1 < response.target.numWords(sentenceIdx); ++t) {
size_t maxS = 0;
for (size_t s = 1; s + 1 < response.source.numWords(sentenceIdx); ++s) {
if (response.alignments[sentenceIdx][t][s] > response.alignments[sentenceIdx][t][maxS]) {
maxS = s;
}
}
alignments.back().push_back(maxS);
alignments.back().push_back(
std::max_element(response.alignments[sentenceIdx][t].begin(), response.alignments[sentenceIdx][t].end()) -
response.alignments[sentenceIdx][t].begin());
}
// Next, we try to smooth out these selected alignments with a few heuristics
@ -622,7 +722,14 @@ void HTML::hardAlignments(Response const &response, std::vector<std::vector<size
float currScore = response.alignments[sentenceIdx][t][currSentenceIdx];
float prevScore = response.alignments[sentenceIdx][t - 1][prevSentenceIdx];
if (currScore >= prevScore) {
TagStack const &currTagStack = sourceTokenSpans[offset + 1 + currSentenceIdx]->tags;
TagStack const &prevTagStack = sourceTokenSpans[offset + 1 + prevSentenceIdx]->tags;
// If this token has more markup, or a better score than the previous
// token (and they together are part of a word-ish thing) then mark
// this word as aligning. Otherwise just copy the alignment source of
// the previous token.
if (extends(currTagStack, prevTagStack) || currScore >= prevScore) {
// Apply this to all previous tokens in the word
for (size_t i = t;; --i) {
alignments.back()[i] = currSentenceIdx;
@ -640,6 +747,8 @@ void HTML::hardAlignments(Response const &response, std::vector<std::vector<size
// Always align target end with source end
alignments.back().push_back(response.source.numWords(sentenceIdx) - 1);
offset += response.source.numWords(sentenceIdx) + 1; // +1 for prefix gap
}
}

View File

@ -2,51 +2,123 @@
#define SRC_BERGAMOT_HTML_H_
#include <forward_list>
#include <set>
#include <stdexcept>
#include <string>
#include <unordered_set>
#include <string_view>
#include "annotation.h"
#include "data/types.h"
#include "definitions.h"
namespace marian {
namespace bergamot {
namespace marian::bergamot {
struct Response;
/// HTML class parses and removes HTML from input text, and places it back into
/// the translated output text.
///
/// When parsing the HTML, it treats tags as markup, where a list of nested tags
/// can be seen as a list of markups that are applicable to all the text that
/// follows. This list is stored as a `TagStack`. Whenever an HTML tag opens or
/// closes, a new TagStack is created to reflect that. TagStack used to be
/// called `Taint` because it *tainted* the text it was associated with with
/// those tags as markup. The text between tags themselves is stored in the
/// input variable. In `spans_`, the TagStack that is associated with a
/// substring of that text is stored.
/// When transferring the HTML from the source text to the translated target
/// text, the TagStacks are first associated with each of the subwords from the
/// source text. Using hard alignment, each subword in the source text is linked
/// to a subword in the target text. The TagStacks are then copied over these
/// links. Finally, the HTML is inserted back into the target text by comparing,
/// for each subword, the TagStack of the previous subword to its own, and
/// opening and closing elements to make up for the difference.
///
/// There are a couple of complexities though:
/// 1. Not all tags can be treated as markup applied to text. For example, an
/// `<img>` does not contain text itself. Or `<i></i>` does not. We do want
/// those tags to remain in the output though. We do this by associating
/// them to an empty `Span`. When inserting HTML back into the translation
/// input or output, we keep track of where in the `spans_` vector we are,
/// and insert any elements from empty spans that we might have skipped over
/// because empty spans are never linked to tokens/subwords. These are
/// *stragglers* in some parts of the code, or *void* or *empty* elements in
/// other parts.
/// 2. Some tags should be treated as paragraph indicators, and break up
/// sentences. These are the usual suspects like `<p>`, but also `<li>` and
/// `<td>`, to make sure we don't translate two table cells into a single
/// word. This is the `addSentenceBreak` flag in the HTML parsing bit.
/// We mark these breaks with `\n\n` in the input text and with a special
/// WHITESPACE tag that we treat as any other void tag. Hopefully this tag
/// moves with the added `\n\n` and it is easy for us to remove it again.
/// (In practice it is, since these only occur at the end of sentences and
/// the ends of sentences are always aligned between source and target.)
/// 3. We treat most tags as word-breaking. We do this by adding spaces just
/// after where we saw the open or close tag occur. If there is already
/// some whitespace in that place, we do not add extra spaces.
/// 4. TODO
class HTML {
public:
using TagNameSet = std::set<std::string, std::less<>>;
/// Options struct that controls how HTML is interpreted.
struct Options {
// List of elements for which we do not expect a closing tag, or self-closing
// elements in XHTML. See also https://developer.mozilla.org/en-US/docs/Glossary/Empty_element
// More relevant source of this list:
// https://searchfox.org/mozilla-central/rev/7d17fd1fe9f0005a2fb19e5d53da4741b06a98ba/dom/base/FragmentOrElement.cpp#1791
std::unordered_set<std::string> voidTags{"area", "base", "basefont", "bgsound", "br", "col",
"embed", "frame", "hr", "img", "input", "keygen",
"link", "meta", "param", "source", "track", "wbr"};
/// List of elements for which we do not expect a closing tag, or
/// self-closing elements in XHTML. We do not need to see a closing tag
/// for these elements, and they cannot contain text or tags themselves.
/// See also:
/// https://developer.mozilla.org/en-US/docs/Glossary/Empty_element.
/// More relevant source of this list:
/// https://searchfox.org/mozilla-central/rev/7d17fd1fe9f0005a2fb19e5d53da4741b06a98ba/dom/base/FragmentOrElement.cpp#1791
TagNameSet voidTags{"area", "base", "basefont", "bgsound", "br", "col", "embed", "frame", "hr",
"img", "input", "keygen", "link", "meta", "param", "source", "track", "wbr"};
std::unordered_set<std::string> inlineTags{"abbr", "a", "b", "em", "i", "kbd", "mark", "math",
"output", "q", "ruby", "small", "span", "strong", "sub", "sup",
"time", "u", "var", "wbr", "ins", "del", "img"};
/// List of elements that are treated as inline, meaning they do not break
/// up sentences. Any element *not* in this list will cause the text that
/// follows its open or close tag to be treated as a separate sentence.
TagNameSet inlineTags{"abbr", "a", "b", "em", "i", "kbd", "mark", "math",
"output", "q", "ruby", "small", "span", "strong", "sub", "sup",
"time", "u", "var", "wbr", "ins", "del", "img"};
// List of characters that occur at the start of a token that indicate that
// the this token is probably *not* a continuation of a word. Set to empty
// to never mark a token as a continuation of the word.
// std::string continuationDelimiters = "\n ,.(){}[]";
std::string continuationDelimiters;
/// List of elements that are, regardless of `substituteInlineTagsWithSpaces`,
/// not substituted with spaces. Technically almost all inline elements
/// should be treated like this, except `<br>` maybe. But in practice it
/// seems to be more effective to limit this set to just the one tag that
/// can only really be used *inside* words: `<wbr>`.
/// See also: https://developer.mozilla.org/en-US/docs/Web/HTML/Element/wbr
TagNameSet inWordTags{"wbr"};
// Should we always add spaces to the places where tags used to be? I.e.
// `un<u>der</u>line` should become `un der line`?
/// List of elements we copy as is, but do parse as if they're HTML because
/// they could be nested. For <script> we just scan for </script> because
/// the script tag may not be nested, but that is not the case for these
/// elements per se. Some tags, like <script>, are ignored at the `Scanner`
/// level. See `xh_scanner.cpp/Scanner::scanAttribute()`.
TagNameSet ignoredTags{"code", "kbd", "samp", "var", "dir", "acronym", "math"};
/// List of characters that occur at the start of a token that indicate that
/// this token is probably *not* a continuation of a word. This is also
/// used to determine whether there should be a space after a closing tag
/// or not. I.e. a `.` after a `</strong>` does not need to be separated by
/// an extra space.
std::string continuationDelimiters = "\n ,.(){}[]";
/// Should we always add spaces to the places where tags used to be? I.e.
/// `un<u>der</u>line` should become `un der line`? This does help with
/// retaining tags inside words, or with odd pages that use CSS to add
/// spacing between a lot of tags. Cases like `<td>` and `<li>` are already
/// covered by treating them as sentence splitting.
bool substituteInlineTagsWithSpaces = true;
};
/// Represents a tag, or markup that is being applied to a string of text.
/// We treat all elements except `ELEMENT` as void elements or empty elements.
struct Tag {
enum NodeType {
ELEMENT,
VOID_ELEMENT,
COMMENT,
PROCESSING_INSTRUCTION,
WHITESPACE, // negative space
ELEMENT, // <b>...</b>
VOID_ELEMENT, // <img>
COMMENT, // <!-- ... -->
PROCESSING_INSTRUCTION, // <?...?>
WHITESPACE, // A \n\n we inserted to break a sentence.
};
NodeType type; // Type of the node
@ -55,48 +127,94 @@ class HTML {
// entities and prefix whitespace)
std::string data; // Raw data of an element that just needs to be
// copied as is, e.g. <script> or <style>
// @TODO: if the original HTML stays in memory, we could replace
// `attributes` and `data` with string_views pointing to it.
};
using Taint = std::vector<Tag *>;
/// Representation of markup that is being applied to a string of text. Order
/// matters as this represents how the tags are nested. The `Tag` objects
/// themselves are owned by `pool_`.
using TagStack = std::vector<Tag *>;
/// Span of text, with which a `TagStack` is associated. A span may be empty,
/// for example to represent the presence of an empty or VOID element.
struct Span {
size_t begin;
size_t end;
Taint tags; // Note: free pointers! Lifetime of tags is managed by pool_
size_t begin; // Start offset in (plain text) source
size_t end; // end offset in source
TagStack tags; // Note: free pointers to memory owned by `pool_`.
inline size_t size() const { return end - begin; }
};
explicit HTML(std::string &&source, bool process_markup) : HTML(std::move(source), process_markup, HTML::Options{}){};
explicit HTML(std::string &&source, bool process_markup, Options &&options);
/// Parses HTML in `source` (if `processMarkup` is true). `source` is updated
/// to only contain the plain text extracted from the HTML. `HTML` instance
/// retains information about what tags are extracted from where to later
/// reconstruct the HTML in a `Response` object (both `source` and `target`).
explicit HTML(std::string &&source, bool processMarkup) : HTML(std::move(source), processMarkup, HTML::Options{}){};
explicit HTML(std::string &&source, bool processMarkup, Options &&options);
/// It is not safe to copy an HTML instance.
HTML(const HTML &) = delete;
/// Moving is fine
HTML(HTML &&) = default;
/// Reconstructs (not perfectly) the HTML as it was parsed from `source`,
/// and uses alignment information to also reconstruct the same markup in
/// `response.target`.
void restore(Response &response);
private:
using SpanIterator = std::vector<HTML::Span>::const_iterator;
using AnnotatedText = marian::bergamot::AnnotatedText;
/// Reconstructs HTML in `response.source` (passed as `in`) and makes a list
/// `sourceTokenSpans` that associates a `Span` with each subword in `in`.
/// We later use these span pointers to copy tags. They're iterators (or
/// pointers into a list) to be able to compare whether one span came before
/// or after another span.
AnnotatedText restoreSource(AnnotatedText const &in, std::vector<SpanIterator> &sourceTokenSpans);
/// Inserts the HTML into `response.target` (passed as `in`) based on
/// `targetTokenSpans`, which points to a `Span` for each token (subword) in
/// `response.target`.
AnnotatedText restoreTarget(AnnotatedText const &in, std::vector<SpanIterator> const &targetTokenSpans);
void copyTaint(Response const &response, std::vector<std::vector<size_t>> const &alignments,
std::vector<HTML::SpanIterator> const &sourceTokenSpans,
std::vector<HTML::SpanIterator> &targetTokenSpans);
void hardAlignments(Response const &response, std::vector<std::vector<size_t>> &alignments);
bool isContinuation(string_view prev, string_view str);
// Allocates tag in pool_ (which then owns it) and gives a pointer to be used
// in Taints. Pointer is valid as long as this HTML instance lives on.
/// Utilities to test whether subword `str` is part of a word together with
/// the subword `prev`, or a separate word. Basically *does `str` start with
/// a space*, but a bit more complex to deal with punctuation.
bool isContinuation(marian::string_view prev, marian::string_view str) const;
bool isContinuation(std::string_view prev, std::string_view str) const;
/// Copies span pointers from the subwords/tokens from the source text to the
/// subwords of the target text in `targetTokenSpans` using alignment
/// information in `response`.
void copyTagStack(Response const &response, std::vector<std::vector<size_t>> const &alignments,
std::vector<HTML::SpanIterator> const &sourceTokenSpans,
std::vector<HTML::SpanIterator> &targetTokenSpans);
/// Turns the alignment scores in `response.alignments` into one source token
/// per target token. Has some heuristics to keep all target tokens of a
/// single word pointing to the same span, and prefers spans with more markup
/// over spans with less to try to retain as much of the input markup as
/// possible.
void hardAlignments(Response const &response, std::vector<std::vector<size_t>> &alignments,
std::vector<HTML::SpanIterator> const &sourceTokenSpans);
/// Allocates a tag in `pool_` (which then owns it) and gives a pointer to be
/// used in TagStacks. Pointer is valid as long as this HTML instance lives on.
Tag *makeTag(Tag &&tag);
/// HTML options associated with this parse.
Options options_;
// List of text spans, and which tags are applied to them
/// List of spans of text in plain text `source`, and which tags are applied
/// to them.
std::vector<Span> spans_;
// a pool of tags that we free when HTML goes out of scope
/// A pool of tags. `std::forward_list` because we do not want pointers to it
/// to be invalidated when new tags are allocated. This way it is easy to
/// deallocate them all when `HTML` goes out of scope.
std::forward_list<Tag> pool_;
};
} // namespace bergamot
} // namespace marian
} // namespace marian::bergamot
#endif // SRC_BERGAMOT_HTML_H_

View File

@ -19,7 +19,7 @@ struct ResponseOptions {
bool qualityScores{false}; ///< Include quality-scores or not.
bool alignment{false}; ///< Include alignments or not.
bool HTML{false}; /// Remove HTML tags from text and (TODO) insert in output.
bool HTML{false}; /// Remove HTML tags from text and insert in output.
/// Whether to include sentenceMappings or not. Alignments require
/// sentenceMappings and are available irrespective of this option if

View File

@ -37,6 +37,11 @@ bool operator==(markup::string_ref const &str, const Char_t (&str2)[Len]) {
return str.size == Len - 1 && std::memcmp(str.data, str2, Len - 1) == 0;
}
/// Compile-time size of a `char` array minus one. For a string literal this
/// is its length without the terminating NUL, e.g. `length("-->")` is 3.
/// Used in place of magic numbers when skipping over fixed markers.
template <size_t Size>
constexpr size_t length(const char (&)[Size]) {
  return Size - 1;
}
} // end namespace
namespace markup {
@ -52,6 +57,8 @@ std::string_view Scanner::tag() const { return std::string_view(tagName_.data, t
Scanner::TokenType Scanner::scanBody() {
value_ = string_ref{input_.pos(), 0};
start_ = input_.pos();
switch (input_.peek()) {
case '\0':
return TT_EOF;
@ -97,15 +104,16 @@ Scanner::TokenType Scanner::scanAttribute() {
switch (input_.peek()) {
case '>':
input_.consume();
if (equalsCaseInsensitive(tagName_, "script")) {
// Treat some elements as opaque, e.g. <script>, <style>
if (/*equalsCaseInsensitive(tagName_, "title") ||*/ equalsCaseInsensitive(tagName_, "script") ||
equalsCaseInsensitive(tagName_, "style") || equalsCaseInsensitive(tagName_, "textarea") ||
equalsCaseInsensitive(tagName_, "iframe") || equalsCaseInsensitive(tagName_, "noembed") ||
equalsCaseInsensitive(tagName_, "noscript") || equalsCaseInsensitive(tagName_, "noframes")) {
// script is special because we want to parse the attributes,
// but not the content
scanFun_ = &Scanner::scanSpecial;
return scanSpecial();
} else if (equalsCaseInsensitive(tagName_, "style")) {
// same with style
scanFun_ = &Scanner::scanSpecial;
return scanSpecial();
} else {
scanFun_ = &Scanner::scanBody;
return scanBody();
@ -198,10 +206,11 @@ Scanner::TokenType Scanner::scanAttribute() {
// - TT_ENTITY_START
// - TT_ERROR if unexpected character or end
Scanner::TokenType Scanner::scanTag() {
start_ = input_.pos();
if (input_.consume() != '<') return TT_ERROR;
bool is_tail = input_.peek() == '/';
if (is_tail) input_.consume();
bool isTail = input_.peek() == '/';
if (isTail) input_.consume();
tagName_ = string_ref{input_.pos(), 0};
@ -226,7 +235,7 @@ Scanner::TokenType Scanner::scanTag() {
if (!input_.peek()) return TT_EOF;
if (is_tail) return input_.consume() == '>' ? TT_TAG_END : TT_ERROR;
if (isTail) return input_.consume() == '>' ? TT_TAG_END : TT_ERROR;
scanFun_ = &Scanner::scanAttribute;
return TT_TAG_START;
@ -234,6 +243,7 @@ Scanner::TokenType Scanner::scanTag() {
Scanner::TokenType Scanner::scanEntity(TokenType parentTokenType) {
// `entity` includes starting '&' and ending ';'
start_ = input_.pos();
string_ref entity{input_.pos(), 0};
bool hasEnd = false;
@ -312,11 +322,13 @@ bool Scanner::isWhitespace(char c) {
Scanner::TokenType Scanner::scanComment() {
if (gotTail_) {
start_ = input_.pos() - length("-->"); // minus "-->"
scanFun_ = &Scanner::scanBody;
gotTail_ = false;
return TT_COMMENT_END;
}
start_ = input_.pos();
value_ = string_ref{input_.pos(), 0};
while (true) {
@ -325,7 +337,7 @@ Scanner::TokenType Scanner::scanComment() {
if (endsWith(value_, "-->")) {
gotTail_ = true;
value_.size -= 3;
value_.size -= length("-->");
break;
}
}
@ -334,11 +346,13 @@ Scanner::TokenType Scanner::scanComment() {
Scanner::TokenType Scanner::scanProcessingInstruction() {
if (gotTail_) {
start_ = input_.pos() - length("?>");
scanFun_ = &Scanner::scanBody;
gotTail_ = false;
return TT_PROCESSING_INSTRUCTION_END;
}
start_ = input_.pos();
value_ = string_ref{input_.pos(), 0};
while (true) {
@ -347,7 +361,7 @@ Scanner::TokenType Scanner::scanProcessingInstruction() {
if (endsWith(value_, "?>")) {
gotTail_ = true;
value_.size -= 2;
value_.size -= length("?>");
break;
}
}
@ -356,11 +370,13 @@ Scanner::TokenType Scanner::scanProcessingInstruction() {
Scanner::TokenType Scanner::scanSpecial() {
if (gotTail_) {
start_ = input_.pos() - (tagName_.size + length("</>"));
scanFun_ = &Scanner::scanBody;
gotTail_ = false;
return TT_TAG_END;
}
start_ = input_.pos();
value_ = string_ref{input_.pos(), 0};
while (true) {
@ -369,17 +385,17 @@ Scanner::TokenType Scanner::scanSpecial() {
// Test for </tag>
// TODO: no whitespaces allowed? Is that okay?
if (value_.data[value_.size - 1] == '>' && value_.size >= tagName_.size + 3) {
if (value_.data[value_.size - 1] == '>' && value_.size >= tagName_.size + length("</>")) {
// Test for the "</" bit of "</tag>"
size_t pos_tag_start = value_.size - tagName_.size - 3;
if (std::memcmp(value_.data + pos_tag_start, "</", 2) != 0) continue;
size_t posTagStart = value_.size - tagName_.size - length("</>");
if (std::memcmp(value_.data + posTagStart, "</", length("</")) != 0) continue;
// Test for the "tag" bit of "</tag>". Doing case insensitive compare because <I>...</i> is okay.
size_t pos_tag_name = value_.size - tagName_.size - 1; // end - tag>
if (!equalsCaseInsensitive(value_.data + pos_tag_name, tagName_.data, tagName_.size)) continue;
size_t posTagName = value_.size - tagName_.size - length(">"); // end - tag>
if (!equalsCaseInsensitive(value_.data + posTagName, tagName_.data, tagName_.size)) continue;
gotTail_ = true;
value_.size -= tagName_.size + 3;
value_.size -= tagName_.size + length("</>");
break;
}
}

View File

@ -83,6 +83,7 @@ class Scanner {
tagName_{nullptr, 0},
attributeName_{nullptr, 0},
input_(is),
start_(nullptr),
scanFun_(&Scanner::scanBody),
gotTail_(false) {}
@ -98,6 +99,8 @@ class Scanner {
// get tag name
std::string_view tag() const;
inline const char *start() const { return start_; }
private: /* methods */
typedef TokenType (Scanner::*ScanPtr)();
@ -137,6 +140,9 @@ class Scanner {
instream &input_;
// Start position of a token.
const char *start_;
bool gotTail_; // aux flag used in scanComment, scanSpecial, scanProcessingInstruction
};
} // namespace markup