Treat most HTML elements as word-breaking (#286)

This commit is contained in:
Jelmer 2022-01-16 10:26:40 +00:00 committed by GitHub
parent 13c55e2693
commit e061b5613e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 595 additions and 278 deletions

2
.gitignore vendored
View File

@ -19,7 +19,7 @@ _deps
wasm/test_page/node_modules
build-wasm
models
wasm/test_page/bergamot-translator-worker.*
wasm/test_page/js/bergamot-translator-worker.*
# VSCode
.vscode

@ -1 +1 @@
Subproject commit 332e976df4583793a09b6483b80b972621fcfadb
Subproject commit b46987e96fc27b7e9488fbc36b53c07e1786784c

View File

@ -169,24 +169,136 @@ TEST_CASE("Test case html entities") {
// These are all entities I would expect in innerHTML, since all other entities
// can be encoded as UTF-8 so there's no need to encode them through &...; when
// innerHTML encodes the DOM as HTML.
std::string input("<p data-attr=\"&quot;&apos;\">This is a sentence &lt;with&gt; named &amp; entities</p>\n");
std::string input("<p data-attr=\"&quot;&apos;\">This is a sentence &lt;with&gt; named &amp; entities</p>");
HTML html(std::move(input), true);
CHECK(input == "This is a sentence <with> named & entities\n");
CHECK(input == "This is a sentence <with> named & entities");
}
TEST_CASE("Test self-closing tags should be treated as spaces") {
std::string input("<p>Space<br>please?</p>\n");
TEST_CASE("Test self-closing tags should be treated as paragraph break") {
std::string test_str("<p>Space<br>please?</p>");
std::string input(test_str);
HTML html(std::move(input), true);
CHECK(input == "Space please?\n");
CHECK(input == "Space\n\nplease?");
Response response;
std::string source_str("Space\n\nplease?");
std::vector<string_view> source_tokens{
string_view(source_str.data() + 0, 5), // Space
string_view(source_str.data() + 5, 0), // [EOS]
string_view(source_str.data() + 5, 2), // \n\n
string_view(source_str.data() + 7, 1), // p
string_view(source_str.data() + 8, 5), // lease
string_view(source_str.data() + 13, 1), // ?
string_view(source_str.data() + 14, 0), // EOS
};
response.source.appendSentence("", source_tokens.begin(), source_tokens.begin() + 2);
response.source.appendSentence("\n\n", source_tokens.begin() + 3, source_tokens.end());
std::string target_str("Platz\n\nbitte?");
std::vector<string_view> target_tokens{
string_view(target_str.data() + 0, 5), // Platz
string_view(target_str.data() + 5, 0), // [EOS]
string_view(target_str.data() + 5, 2), // \n\n
string_view(target_str.data() + 7, 5), // bitte
string_view(target_str.data() + 12, 1), // ?
string_view(target_str.data() + 13, 0), // [EOS]
};
response.target.appendSentence("", target_tokens.begin(), target_tokens.begin() + 2);
response.target.appendSentence("", target_tokens.begin() + 3, target_tokens.end());
response.alignments = {{
{1.0, 0.0}, // Platz <- Space
{0.0, 1.0} // [EOS] <- [EOS]
},
{
{0.1, 0.9, 0.0, 0.0}, // _bitte <- _p + lease
{0.0, 0.0, 1.0, 0.0}, // ? <- ?
{0.0, 0.0, 0.0, 1.0}, // [EOS] <- [EOS]
}};
// Main focus of this test is that the space that was introduced in the text
// that was being translated does not end up in the translation.
html.restore(response);
CHECK(response.source.text == "<p>Space<br>please?</p>");
CHECK(response.target.text == "<p>Platz<br>bitte?</p>");
}
// With the default options an inline tag inside a word is replaced by spaces
// in the text handed to the translator ("un<u>der</u>line" -> "un der line").
// After restore() the <u> markup has to wrap the aligned token in both the
// source and the target.
TEST_CASE("Test inline tags should be treated as spaces") {
std::string test_str("un<u>der</u>line");
std::string input(test_str);
HTML html(std::move(input), true);
// HTML's constructor rewrites `input` in place to the markup-free text.
CHECK(input == "un der line");
Response response;
// Source tokens as (offset, length) views into source_str; a leading "_" in
// the comments marks the space that belongs to the token.
std::string source_str("un der line");
std::vector<string_view> source_tokens{
string_view(source_str.data() + 0, 2), // un
string_view(source_str.data() + 2, 3), // _de
string_view(source_str.data() + 5, 1), // r
string_view(source_str.data() + 6, 5), // _line
string_view(source_str.data() + 11, 0), // [EOS]
};
response.source.appendSentence("", source_tokens.begin(), source_tokens.end());
// Target tokens; note the word order differs from the source.
std::string target_str("una linea der");
std::vector<string_view> target_tokens{
string_view(target_str.data() + 0, 3), // una
string_view(target_str.data() + 3, 6), // _linea
string_view(target_str.data() + 9, 3), // _de
string_view(target_str.data() + 12, 1), // r
string_view(target_str.data() + 13, 0), // [EOS]
};
response.target.appendSentence("", target_tokens.begin(), target_tokens.end());
// One 5x5 score matrix for the single sentence.
// NOTE(review): presumably rows are target tokens and columns are source
// tokens, in the token order listed above -- confirm against restore().
response.alignments = {{{0.9795, 0.0127, 0.0002, 0.0066, 0.0009},
{0.0098, 0.2967, 0.0156, 0.6640, 0.0138},
{0.0214, 0.7472, 0.0626, 0.0745, 0.0943},
{0.0022, 0.0230, 0.9357, 0.0165, 0.0226},
{0.0122, 0.0240, 0.0085, 0.7427, 0.2125}}};
html.restore(response);
CHECK(response.source.text == "un <u>der</u> line"); // TODO leave spaces?
CHECK(response.target.text == "una linea <u>der</u>");
}
// With substituteInlineTagsWithSpaces disabled, an inline tag in the middle of
// a word must not split that word in the text handed to the translator.
TEST_CASE("Test inline tags should not break words") {
std::string test_str("un<u>der</u>line");
std::string input(test_str);
HTML::Options options;
options.substituteInlineTagsWithSpaces = false;
HTML html(std::move(input), true, std::move(options));
// HTML's constructor rewrites `input` in place to the markup-free text.
CHECK(input == "underline");
Response response;
// The whole word is a single token followed by the end-of-sentence token.
std::string source_str("underline");
std::vector<string_view> source_tokens{
string_view(source_str.data() + 0, 9), // underline
string_view(source_str.data() + 9, 0), // [EOS]
};
response.source.appendSentence("", source_tokens.begin(), source_tokens.end());
std::string target_str("subrayar");
std::vector<string_view> target_tokens{
string_view(target_str.data() + 0, 8), // subrayar
string_view(target_str.data() + 8, 0), // [EOS]
};
response.target.appendSentence("", target_tokens.begin(), target_tokens.end());
// Two tokens on each side, aligned one-to-one.
response.alignments = {identity_matrix<float>(2)};
html.restore(response);
// Current behaviour: the <u> element is emitted empty in front of the word
// rather than around "der"; the TODOs below track that.
CHECK(response.source.text == "<u></u>underline"); // TODO not spread <u> to whole word?
CHECK(response.target.text == "<u></u>subrayar"); // TODO not spread <u> to the whole word?
}
TEST_CASE("Test reconstruction of target sentence") {
std::string input("<p>hello <b>world</b></p>\n");
HTML html(std::move(input), true);
CHECK(input == "hello world\n");
CHECK(input == "hello world\n\n\n"); // tripple \n because \n + </p>
AnnotatedText source("hello world\n");
AnnotatedText source("hello world\n\n\n");
recordSentenceFromByteRange(source, {
ByteRange{0, 4}, // 0.0 "hell"
ByteRange{4, 5}, // 0.1 "o"
@ -194,7 +306,7 @@ TEST_CASE("Test reconstruction of target sentence") {
ByteRange{11, 11} // 0.3 ""
});
AnnotatedText target("hallo Welt\n");
AnnotatedText target("hallo Welt\n\n\n");
recordSentenceFromByteRange(target, {
ByteRange{0, 4}, // 0.0 "hall"
ByteRange{4, 5}, // 0.1 "o"
@ -218,11 +330,11 @@ TEST_CASE("Test reconstruction of target sentence") {
}
TEST_CASE("Test reconstruction of target sentence with entities") {
std::string input("<p>hello <b>world &amp; friends!</b></p>\n");
std::string input("<p>hello <b>world &amp; friends!</b></p>");
HTML html(std::move(input), true);
CHECK(input == "hello world & friends!\n");
CHECK(input == "hello world & friends!");
AnnotatedText source("hello world & friends!\n");
AnnotatedText source("hello world & friends!");
recordSentenceFromByteRange(source, {
ByteRange{0, 4}, // 0.0 "hell"
ByteRange{4, 5}, // 0.1 "o"
@ -233,7 +345,7 @@ TEST_CASE("Test reconstruction of target sentence with entities") {
ByteRange{22, 22} // 0.6 ""
});
AnnotatedText target("hallo Welt & Freunde!\n");
AnnotatedText target("hallo Welt & Freunde!");
recordSentenceFromByteRange(target, {
ByteRange{0, 4}, // 0.0 "hall"
ByteRange{4, 5}, // 0.1 "o"
@ -252,11 +364,11 @@ TEST_CASE("Test reconstruction of target sentence with entities") {
html.restore(response);
std::vector<std::string> html_tokens_source{"", "<p>hell", "o", " <b>world", " &amp;",
" friends", "!", "", "</b></p>\n"};
" friends", "!", "", "</b></p>"};
std::vector<std::string> html_tokens_target{"", "<p>hall", "o", " <b>Welt", " &amp;",
std::vector<std::string> html_tokens_target{"", "<p>hall", "o", " <b>Welt", " &amp;",
" Freunde", "!", "", "</b></p>\n"};
" Freunde", "!", "", "</b></p>"};
CHECK(asTokens(response.source) == html_tokens_source);
CHECK(asTokens(response.target) == html_tokens_target);
@ -264,10 +376,10 @@ TEST_CASE("Test reconstruction of target sentence with entities") {
TEST_CASE("Test reconstruction of target with multiple sentences") {
std::string input(
"<p>hello <b>world!</b> How does this <img> <b>deal <u>with multiple sentences?</u></b> Will it work?</p>\n");
"<p>hello <b>world!</b> How does this <img> <b>deal <u>with multiple sentences?</u></b> Will it work?</p>");
HTML html(std::move(input), true);
AnnotatedText source("hello world! How does this deal with multiple sentences? Will it work?\n");
AnnotatedText source("hello world! How does this deal with multiple sentences? Will it work?");
CHECK(source.text == input);
recordSentenceFromByteRange(source, {
@ -297,7 +409,7 @@ TEST_CASE("Test reconstruction of target with multiple sentences") {
ByteRange{71, 71} // 2.4 ""
});
AnnotatedText target("hallo Welt! Wie geht das mit mehreren Sätzen um? Wird es funktionieren?\n");
AnnotatedText target("hallo Welt! Wie geht das mit mehreren Sätzen um? Wird es funktionieren?");
recordSentenceFromByteRange(target, {
ByteRange{0, 4}, // 0.0 "hall"
ByteRange{4, 5}, // 0.1 "o"
@ -327,7 +439,7 @@ TEST_CASE("Test reconstruction of target with multiple sentences") {
std::vector<std::string> text_tokens_source{
"", "hall", "o", " Welt", "!", "", " ", "Wie", " geht", " das", " mit", " mehreren",
" Sätze", "n", " um", "?", "", " ", "Wird", " es", " funktionieren", "?", "", "\n"};
" Sätze", "n", " um", "?", "", " ", "Wird", " es", " funktionieren", "?", "", ""};
CHECK(asTokens(target) == text_tokens_source);
@ -360,26 +472,56 @@ TEST_CASE("Test reconstruction of target with multiple sentences") {
" work",
"?",
"",
"</p>\n"};
"</p>"};
CHECK(asTokens(response.source) == html_tokens_source);
}
TEST_CASE("Test self-closing tag (HTML5)") {
std::string input("<p>hello <img> <b>world</b> <u>and other <a href=\"#\">creatures</a></u></p>\n");
std::string input("<p>hello <img> <b>world</b> <u>and other <a href=\"#\">creatures</a></u></p>");
HTML html(std::move(input), true);
CHECK(input == "hello world and other creatures\n"); // Note double space between "hello" and "world"
CHECK(input == "hello world and other creatures"); // Note double space between "hello" and "world"
}
TEST_CASE("Test empty self-closing tag at end of input") {
TEST_CASE("Test empty void tag at end of input") {
std::string input("hello <br>");
HTML html(std::move(input), true);
CHECK(input == "hello ");
Response response;
std::string sentence_str("hello ");
std::vector<string_view> sentence{
string_view(sentence_str.data() + 0, 4), // 0.0 hell
string_view(sentence_str.data() + 4, 2), // 0.1 o_
string_view(sentence_str.data() + 6, 0), // 0.2 [EOS]
};
response.source.appendSentence("", sentence.begin(), sentence.end());
response.target.appendSentence("", sentence.begin(), sentence.end());
response.alignments = {identity_matrix<float>(3)};
html.restore(response);
CHECK(response.source.text == "hello <br>");
CHECK(response.target.text == "hello <br>");
}
// An empty element pair trailing the text has to disappear from the plain
// text and come back at the same place in both source and target on restore.
TEST_CASE("Test empty tag pair at end of input") {
  std::string input("hello <u></u>");
  HTML html(std::move(input), true);
  CHECK(input == "hello ");

  // Identity "translation": source and target share one tokenisation.
  std::string text("hello ");
  char const *const base = text.data();
  std::vector<string_view> tokens{
      string_view(base + 0, 4),  // 0.0 hell
      string_view(base + 4, 2),  // 0.1 o_
      string_view(base + 6, 0),  // 0.2 [EOS]
  };

  Response response;
  response.source.appendSentence("", tokens.begin(), tokens.end());
  response.target.appendSentence("", tokens.begin(), tokens.end());
  response.alignments = {identity_matrix<float>(3)};

  html.restore(response);

  CHECK(response.source.text == "hello <u></u>");
  CHECK(response.target.text == "hello <u></u>");
}
TEST_CASE("Test empty self-closing pair at end of input in parent") {
@ -391,11 +533,11 @@ TEST_CASE("Test empty self-closing pair at end of input in parent") {
TEST_CASE("Test empty tag") {
std::string test_str(
"<p id=\"1\">hello <img id=\"1.1\"><span id=\"1.2\"><u id=\"1.2.1\"></u><b id=\"1.2.2\"></b><img "
"id=\"1.2.3\">world</span></p>\n");
"id=\"1.2.3\">world</span></p>");
std::string input(test_str);
HTML html(std::move(input), true);
CHECK(input == "hello world\n");
CHECK(input == "hello world");
Response response;
@ -407,11 +549,7 @@ TEST_CASE("Test empty tag") {
string_view(sentence_str.data() + 11, 0), // 0.3 ""
};
response.source.appendSentence("", sentence.begin(), sentence.end());
response.source.appendEndingWhitespace("\n");
response.target.appendSentence("", sentence.begin(), sentence.end());
response.target.appendEndingWhitespace("\n");
response.alignments = {identity_matrix<float>(4)};
html.restore(response);
@ -424,19 +562,20 @@ TEST_CASE("Test <script> element") {
std::string input(test_str);
HTML html(std::move(input), true);
CHECK(input == "hello world");
CHECK(input == "hello \n\nworld");
Response response;
std::string sentence_str("hello world");
std::string sentence_str("hello \n\nworld");
std::vector<string_view> sentence{
string_view(sentence_str.data() + 0, 4), // 0.0 hell
string_view(sentence_str.data() + 4, 1), // 0.1 o
string_view(sentence_str.data() + 5, 6), // 0.2 _world
string_view(sentence_str.data() + 11, 0), // 0.3 ""
string_view(sentence_str.data() + 4, 2), // 0.1 o_
string_view(sentence_str.data() + 6, 2), // 0.2 \n\n
string_view(sentence_str.data() + 8, 5), // 0.3 world
string_view(sentence_str.data() + 13, 0), // 0.4 ""
};
response.source.appendSentence("", sentence.begin(), sentence.end());
response.target.appendSentence("", sentence.begin(), sentence.end());
response.alignments = {identity_matrix<float>(4)};
response.alignments = {identity_matrix<float>(5)};
html.restore(response);
CHECK(response.source.text == test_str);
@ -466,10 +605,10 @@ TEST_CASE("Test comment") {
CHECK(response.target.text == test_str);
}
TEST_CASE("End-to-end translation") {
std::string input("<p>I <b>like</b> to <u>drive</u> this car.</p>\n");
TEST_CASE("End-to-end translation", "[!mayfail]") {
std::string input("<p>I <b>like</b> to <u>drive</u> this car.</p>");
HTML html(std::move(input), true);
CHECK(input == "I like to drive this car.\n");
CHECK(input == "I like to drive this car.");
Response response;
@ -500,7 +639,6 @@ TEST_CASE("End-to-end translation") {
string_view(sentence_str.data() + 25, 0), // 0.7 ""
};
response.source.appendSentence("", sentence.begin(), sentence.end());
response.source.appendEndingWhitespace("\n");
}
{
@ -517,7 +655,6 @@ TEST_CASE("End-to-end translation") {
string_view(sentence_str.data() + 28, 0), // 0.8 ""
};
response.target.appendSentence("", sentence.begin(), sentence.end());
response.target.appendEndingWhitespace("\n");
}
html.restore(response);
@ -536,27 +673,116 @@ TEST_CASE("End-to-end translation") {
string_view(sentence_str.data() + 42, 0), // 0.7 ""
};
source.appendSentence("", sentence.begin(), sentence.end());
source.appendEndingWhitespace("</p>\n");
source.appendEndingWhitespace("</p>");
CHECK(asTokens(response.source) == asTokens(source));
}
{
AnnotatedText target;
std::string sentence_str("<p>Ich <u>fahre</u> <b>gerne</b> dieses Auto.");
// Empty <b></b> because the space token after "Ich" has "<p><b>" markup, passed down from "<b>like</b>"
std::string sentence_str("<p>Ich <b></b><u>fahre</u> <b>gerne</b> dieses Auto.");
std::vector<string_view> sentence{
string_view(sentence_str.data() + 0, 6), // 0.0 "<p>Ich"
string_view(sentence_str.data() + 6, 4), // 0.1 " <u>"
string_view(sentence_str.data() + 10, 4), // 0.2 "fahr"
string_view(sentence_str.data() + 14, 1), // 0.3 "e"
string_view(sentence_str.data() + 15, 13), // 0.4 "</u> <b>gerne"
string_view(sentence_str.data() + 28, 11), // 0.5 "</b> dieses"
string_view(sentence_str.data() + 39, 5), // 0.6 " Auto"
string_view(sentence_str.data() + 44, 1), // 0.7 "."
string_view(sentence_str.data() + 45, 0), // 0.8 ""
string_view(sentence_str.data() + 6, 4), // 0.1 " <b>"
string_view(sentence_str.data() + 10, 11), // 0.2 "</b><u>fahr"
string_view(sentence_str.data() + 21, 1), // 0.3 "e"
string_view(sentence_str.data() + 22, 13), // 0.4 "</u> <b>gerne"
string_view(sentence_str.data() + 35, 11), // 0.5 "</b> dieses"
string_view(sentence_str.data() + 46, 5), // 0.6 " Auto"
string_view(sentence_str.data() + 51, 1), // 0.7 "."
string_view(sentence_str.data() + 52, 0), // 0.8 ""
};
target.appendSentence("", sentence.begin(), sentence.end());
target.appendEndingWhitespace("</p>\n");
target.appendEndingWhitespace("</p>");
CHECK(asTokens(response.target) == asTokens(target));
}
}
TEST_CASE("End-to-end translation when no words with markup align", "[!mayfail]") {
std::string input("<p>I <b>like</b> to <u>drive</u> this car.</p>");
HTML html(std::move(input), true);
CHECK(input == "I like to drive this car.");
Response response;
// clang-format off
response.alignments = std::vector<std::vector<std::vector<float>>>{{
{0.5360, 0.4405, 0.0142, 0.0061, 0.0029, 0.0001, 0.0000, 0.0001},
{0.0451, 0.0602, 0.5120, 0.2584, 0.1145, 0.0062, 0.0019, 0.0017},
{0.0392, 0.0009, 0.6535, 0.2293, 0.0492, 0.0199, 0.0014, 0.0067},
{0.0007, 0.0036, 0.0112, 0.0118, 0.9209, 0.0449, 0.0050, 0.0019},
{0.0000, 0.0004, 0.0008, 0.0047, 0.0163, 0.9683, 0.0045, 0.0050},
{0.0011, 0.0046, 0.0039, 0.0090, 0.0023, 0.0024, 0.9648, 0.0119},
{0.0840, 0.0744, 0.1545, 0.1330, 0.1818, 0.1722, 0.0859, 0.1143},
}};
// clang-format on
{
std::string sentence_str("I like to drive this car.");
std::vector<string_view> sentence{
string_view(sentence_str.data() + 0, 1), // 0.0 "I"
string_view(sentence_str.data() + 1, 5), // 0.1 " like"
string_view(sentence_str.data() + 6, 3), // 0.2 " to"
string_view(sentence_str.data() + 9, 6), // 0.3 " drive"
string_view(sentence_str.data() + 15, 5), // 0.4 " this"
string_view(sentence_str.data() + 20, 4), // 0.5 " car"
string_view(sentence_str.data() + 24, 1), // 0.6 "."
string_view(sentence_str.data() + 25, 0), // 0.7 [EOS]
};
response.source.appendSentence("", sentence.begin(), sentence.end());
}
{
std::string sentence_str("Rád řídím to auto.");
std::vector<string_view> sentence{
string_view(sentence_str.data() + 0, 4), // 0.0 "Rád"
string_view(sentence_str.data() + 4, 6), // 0.1 " říd"
string_view(sentence_str.data() + 10, 3), // 0.2 "ím"
string_view(sentence_str.data() + 13, 3), // 0.3 "_to"
string_view(sentence_str.data() + 16, 5), // 0.4 " auto"
string_view(sentence_str.data() + 21, 1), // 0.5 "."
string_view(sentence_str.data() + 22, 0), // 0.6 [EOS]
};
response.target.appendSentence("", sentence.begin(), sentence.end());
}
html.restore(response);
{
AnnotatedText source;
std::string sentence_str("<p>I <b>like</b> to <u>drive</u> this car.");
std::vector<string_view> sentence{
string_view(sentence_str.data() + 0, 4), // 0.0 "<p>I"
string_view(sentence_str.data() + 4, 8), // 0.1 " <b>like"
string_view(sentence_str.data() + 12, 7), // 0.2 "</b> to"
string_view(sentence_str.data() + 19, 9), // 0.3 " <u>drive"
string_view(sentence_str.data() + 28, 9), // 0.4 "</u> this"
string_view(sentence_str.data() + 37, 4), // 0.5 " car"
string_view(sentence_str.data() + 41, 1), // 0.6 "."
string_view(sentence_str.data() + 42, 0), // 0.7 ""
};
source.appendSentence("", sentence.begin(), sentence.end());
source.appendEndingWhitespace("</p>");
CHECK(asTokens(response.source) == asTokens(source));
}
{
AnnotatedText target;
std::string sentence_str("<p>Rád <b></b>řídím <u></u>to auto.");
std::vector<string_view> sentence{
string_view(sentence_str.data() + 0, 7), // 0.0 "<p>Rád"
string_view(sentence_str.data() + 7, 13), // 0.1 " <b></b>říd"
string_view(sentence_str.data() + 20, 3), // 0.2 "ím"
string_view(sentence_str.data() + 23, 10), // 0.3 "_<u></u>to"
string_view(sentence_str.data() + 33, 5), // 0.4 " auto"
string_view(sentence_str.data() + 38, 1), // 0.5 "."
string_view(sentence_str.data() + 39, 0), // 0.6 [EOS]
};
target.appendSentence("", sentence.begin(), sentence.end());
target.appendEndingWhitespace("</p>");
CHECK(asTokens(response.target) == asTokens(target));
}

View File

@ -43,7 +43,7 @@ void encodeEntities(string_view const &input, std::string &output) {
// Returns the number of whitespace characters (as classified by std::isspace,
// so spaces, tabs and newlines all count) at the start of `input`. Returns 0
// for an empty view and input.size() when the view is whitespace only.
size_t countPrefixWhitespaces(string_view const &input) {
  size_t size = 0;
  // The cast matters: std::isspace has undefined behaviour for negative
  // arguments other than EOF, and every non-ASCII UTF-8 byte is negative when
  // plain `char` is signed.
  while (size < input.size() && std::isspace(static_cast<unsigned char>(input[size]))) ++size;
  return size;
}
@ -59,6 +59,8 @@ std::ostream &operator<<(std::ostream &out, HTML::Tag const *tag) {
return out << "<!--" << tag->data << "-->";
case HTML::Tag::PROCESSING_INSTRUCTION:
return out << "<?" << tag->data << "?>";
case HTML::Tag::WHITESPACE:
return out << "[inserted space]";
}
return out << "[Unknown tag type]";
}
@ -107,27 +109,8 @@ class reversed {
T const &container_;
};
// Returns true for every element name that is NOT in the list below, i.e. for
// every element that is expected to separate words. The listed elements are
// the ones that may show up in the middle of a word and therefore must not get
// spacing added around them. The list is neither strictly "inline" nor "flow"
// content; see also
// https://developer.mozilla.org/en-US/docs/Web/Guide/HTML/Content_categories
// The comparison is an exact, case-sensitive match on `name`.
bool isBlockElement(std::string_view const &name) {
  static std::unordered_set<std::string> wordInternalElements{
      "a",    "abbr", "b",      "del", "em",    "i",    "ins",    "kbd", "mark", "math", "output",
      "q",    "ruby", "small",  "span", "strong", "sub", "sup",    "time", "u",    "var",  "wbr"};

  std::string const key(name);
  return wordInternalElements.count(key) == 0;
}
bool isVoidTag(std::string_view const &name) {
// List of elements for which we do not expect a closing tag, or self-closing
// elements in XHTML. See also https://developer.mozilla.org/en-US/docs/Glossary/Empty_element
// More relevant source of this list:
// https://searchfox.org/mozilla-central/rev/7d17fd1fe9f0005a2fb19e5d53da4741b06a98ba/dom/base/FragmentOrElement.cpp#1791
static std::unordered_set<std::string> voidElements{"area", "base", "basefont", "bgsound", "br", "col",
"embed", "frame", "hr", "img", "input", "keygen",
"link", "meta", "param", "source", "track", "wbr"};
return voidElements.find(std::string(name)) != voidElements.end();
// Membership test: true when `name` is an element of `set` (exact match).
bool contains(std::unordered_set<std::string> const &set, std::string const &name) {
  return set.count(name) != 0;
}
void diffTags(HTML::Taint const &prev, HTML::Taint const &curr, HTML::Taint &opening, HTML::Taint &closing) {
@ -187,8 +170,6 @@ AnnotatedText apply(AnnotatedText const &in, Fun fun) {
return out;
}
// A token counts as the continuation of the preceding token when it is
// non-empty and its first character is anything other than a space.
bool isContinuation(string_view str) {
  if (str.empty()) return false;
  return str[0] != ' ';
}
bool hasAlignments(Response const &response) {
// Test for each sentence individually as a sentence may be empty (or there)
// might be no sentences, so just testing for alignments.empty() would not be
@ -207,85 +188,11 @@ bool hasAlignments(Response const &response) {
return true;
}
// Converts the soft alignment scores in `response.alignments` into hard
// alignments: for every sentence one vector is appended to `alignments`, which
// holds, per target token, the index of the source token it is aligned with.
// Scores are read as response.alignments[sentence][targetToken][sourceToken].
void hardAlignments(Response const &response, std::vector<std::vector<size_t>> &alignments) {
// For each sentence...
for (size_t sentenceIdx = 0; sentenceIdx < response.target.numSentences(); ++sentenceIdx) {
alignments.emplace_back();
// Hard-align: find for each target token the most prevalent source token
// Note: only search from 0 to N-1 because token N is end-of-sentence token
// that can only align with the end-of-sentence token of the target
for (size_t t = 0; t + 1 < response.target.numWords(sentenceIdx); ++t) {
// Source token 0 is the initial candidate; a later token only replaces it
// when its score is strictly higher, so ties keep the earliest token.
size_t maxS = 0;
for (size_t s = 1; s + 1 < response.source.numWords(sentenceIdx); ++s) {
if (response.alignments[sentenceIdx][t][s] > response.alignments[sentenceIdx][t][maxS]) {
maxS = s;
}
}
alignments.back().push_back(maxS);
}
// Next, we try to smooth out these selected alignments with a few heuristics
for (size_t t = 1; t + 1 < response.target.numWords(sentenceIdx); ++t) {
// If this token is a continuation of a previous token, pick the tags from the most
// prevalent token for the whole word.
if (isContinuation(response.target.word(sentenceIdx, t))) {
// Note: only looking at the previous token since that will already
// have this treatment applied to it.
// Despite their names, the two *SentenceIdx locals below hold source token
// indices (the values chosen in the loop above), not sentence indices.
size_t currSentenceIdx = alignments.back()[t];
size_t prevSentenceIdx = alignments.back()[t - 1];
float currScore = response.alignments[sentenceIdx][t][currSentenceIdx];
float prevScore = response.alignments[sentenceIdx][t - 1][prevSentenceIdx];
if (currScore > prevScore) {
// Apply this to all previous tokens in the word
// (walks backwards from t; the loop has no condition and ends via the break)
for (size_t i = t;; --i) {
alignments.back()[i] = currSentenceIdx;
// Stop if this was the first token or the beginning of the word
if (i == 0 || !isContinuation(response.target.word(sentenceIdx, i))) break;
}
} else {
// The previous token scored at least as high: inherit its source token.
alignments.back()[t] = prevSentenceIdx;
}
}
}
// Always align target end with source end
alignments.back().push_back(response.source.numWords(sentenceIdx) - 1);
}
}
// Internal type used to point to a position in HTML::spans_.
typedef std::vector<HTML::Span>::const_iterator SpanIterator;
// Fills `targetTokenSpans` with one span iterator per target token by looking
// up, through the hard `alignments`, the span that was recorded for the
// aligned source token in `sourceTokenSpans`.
//
// Both span lists follow the same layout: per sentence one entry for the gap in
// front of the sentence followed by one entry per token, and a single closing
// entry for the whitespace after the last sentence.
// NOTE: the order of the entries should match the exact order of Apply().
void copyTaint(Response const &response, std::vector<std::vector<size_t>> const &alignments,
               std::vector<SpanIterator> const &sourceTokenSpans, std::vector<SpanIterator> &targetTokenSpans) {
  // Position in sourceTokenSpans of the current sentence's prefix-gap entry.
  size_t sentenceBase = 0;

  for (size_t sentenceIdx = 0; sentenceIdx < response.target.numSentences(); ++sentenceIdx) {
    size_t const sourceWords = response.source.numWords(sentenceIdx);
    size_t const targetWords = response.target.numWords(sentenceIdx);

    // The gap before the sentence takes the span of the matching source gap.
    targetTokenSpans.push_back(sourceTokenSpans[sentenceBase]);

    for (size_t t = 0; t < targetWords; ++t) {
      size_t const s = alignments[sentenceIdx][t];
      assert(s < sourceWords);
      // Skip one entry (the prefix gap) to reach the sentence's tokens.
      targetTokenSpans.push_back(sourceTokenSpans[sentenceBase + 1 + s]);
    }

    // Advance past this sentence's tokens and its prefix gap.
    sentenceBase += sourceWords + 1;
  }

  // What is left is the entry for the ending whitespace.
  assert(sentenceBase < sourceTokenSpans.size());
  targetTokenSpans.push_back(sourceTokenSpans[sentenceBase]);
}
// Little helper class to append HTML to a token
class TokenFormatter {
public:
explicit TokenFormatter(string_view token)
: html_(), offset_(0), whitespaceSize_(countPrefixWhitespaces(token)), closeLeft_(true) {
: html_(), offset_(0), whitespaceOffset_(0), whitespaceSize_(countPrefixWhitespaces(token)), closeLeft_(true) {
// Do encoding of any entities that popped up in the translation
encodeEntities(token, html_);
}
@ -303,6 +210,7 @@ class TokenFormatter {
std::string closeTag = format("</{}>", tag->name);
html_.insert(offset_ + (closeLeft_ ? 0 : whitespaceSize_), closeTag);
offset_ += closeTag.size();
if (closeLeft_) whitespaceOffset_ += closeTag.size();
}
for (HTML::Tag const *tag : opening) {
@ -318,17 +226,28 @@ class TokenFormatter {
case HTML::Tag::PROCESSING_INSTRUCTION:
openTag = format("<?{}?>", tag->data);
break;
case HTML::Tag::WHITESPACE: {
// Try to eat two newlines (paragraph break) from our segment
auto pos = html_.find("\n\n", whitespaceOffset_);
if (pos != std::string::npos && pos < whitespaceOffset_ + whitespaceSize_) {
html_.erase(pos, 2);
whitespaceSize_ -= 2;
}
} break;
}
html_.insert(offset_ + whitespaceSize_, openTag);
offset_ += openTag.size();
closeLeft_ = false;
closeLeft_ = closeLeft_ && openTag.empty();
}
}
private:
std::string html_; // Output html
size_t offset_; // Size added by prepending HTML
size_t whitespaceSize_; // number of prefix whitespace characters
std::string html_; // Output html
size_t offset_; // Size added by prepending HTML
size_t whitespaceOffset_; // position of prefix whitespace characters
// (it moves as closing tags are prepended)
size_t whitespaceSize_; // number of prefix whitespace characters
// Close tags we want to show up left (before) the token, but open tags
// ideally come directly after any prefix whitespace. However, some tokens
@ -339,96 +258,6 @@ class TokenFormatter {
bool closeLeft_;
};
// Rebuilds the HTML for the source text. Every token of `in` is passed through
// apply(); for each token the tag changes between consecutive spans of
// `sourceSpans` are written out through a TokenFormatter, and the last span
// visited for that token is appended to `sourceTokenSpans`.
// Returns the AnnotatedText produced by apply() from the formatted tokens.
AnnotatedText restoreSource(AnnotatedText const &in, std::vector<HTML::Span> const &sourceSpans,
std::vector<SpanIterator> &sourceTokenSpans) {
auto spanIt = sourceSpans.begin();
auto prevIt = sourceSpans.begin(); // safe because first span is always empty span,
// and the while-loop below will do the rest
assert(prevIt == sourceSpans.end() || prevIt->tags.empty());
return apply(in, [&](ByteRange range, string_view token, bool last) {
TokenFormatter formatter(token);
// Potential issue: spans and tokens can intersect, e.g.
//
// text <p> h <u> e </u> ll o </p>
// spans |1| |2| |3333| (so only 2 is tainted with <p><u>, others only <p>)
// tokens |111111111111111|2|
//
// Now 1 covers span 1 to 3, so what taint should it get? Just <p>, or <p><u>?
// Note: only relevant if isBlockElement is used. If we just insert spaces
// around all elements, every segment of `hello` will be a token.
// Seek to the last span that overlaps with this token
while (true) {
// Emit the difference between the previous span's tags and this span's tags.
formatter.append(prevIt->tags, spanIt->tags);
prevIt = spanIt;
// Advance while the next span begins before this token ends; for the last
// token keep going until every remaining span has been consumed.
if (spanIt + 1 != sourceSpans.end() && ((spanIt + 1)->begin < range.end || last)) {
spanIt++;
continue;
}
break;
}
// TODO: This is just the taint of the last span, not the ones in between.
// This makes us lose some markup of parts of tokens as described above.
sourceTokenSpans.push_back(prevIt);
return std::move(formatter.html());
});
}
// Rebuilds the HTML for the target text. `targetTokenSpans` supplies, in the
// order apply() visits the tokens of `in`, the span of `sourceSpans` each token
// takes its tags from. Empty spans lying between the previously used span and
// the assigned one are emitted as well. All tags still open at the last token
// are closed. Returns the AnnotatedText produced by apply().
AnnotatedText restoreTarget(AnnotatedText const &in, std::vector<HTML::Span> const &sourceSpans,
std::vector<SpanIterator> const &targetTokenSpans) {
auto prevSpan = sourceSpans.begin();
auto targetSpanIt = targetTokenSpans.begin();
AnnotatedText out = apply(in, [&](ByteRange range, string_view token, bool last) {
TokenFormatter formatter(token);
// First we scan through spans_ to catch up to the span assigned to this
// token. We're only interested in empty spans (empty and void elements)
for (auto span_it = prevSpan + 1; span_it < *targetSpanIt; span_it++) {
// We're only interested in empty spans between the spans in targetSpanIt
if (span_it->size() != 0) continue;
formatter.append(prevSpan->tags, span_it->tags);
// Note: here, not in 3rd part of for-statement because we don't want to
// set prevSpan if the continue clause at the beginning of this for-loop
// was hit.
prevSpan = span_it;
}
// Now do the same thing but for our target set of tags. Note that we cannot
// combine this in the for-loop above (i.e. `span_it <= *targetSpanIt`)
// because there is no guarantee that the order in `targetTokenSpans` is
// the same as that of `spans`.
formatter.append(prevSpan->tags, (*targetSpanIt)->tags);
// If this is the last token of the response, close all open tags.
if (last) {
// Note: this assert is true due to our current implementation of
// HardAlignments() that always matches the last token of the input with
// the last token of the output. But lets assume someone someday changes
// HardAlignments(), and then this for-loop will be necessary.
// assert((*targetSpanIt)->tags.empty());
formatter.append((*targetSpanIt)->tags, HTML::Taint());
}
// Remember the span just used and move on to the next token's span.
prevSpan = *targetSpanIt++;
return std::move(formatter.html());
});
// Assert that we did in fact use all our taints
assert(targetSpanIt == targetTokenSpans.end());
return out;
}
size_t debugCountTokens(AnnotatedText const &text) {
size_t tokens = 1; // for the ending gap
for (size_t sentenceIdx = 0; sentenceIdx < text.numSentences(); ++sentenceIdx) {
@ -441,8 +270,9 @@ size_t debugCountTokens(AnnotatedText const &text) {
namespace marian::bergamot {
HTML::HTML(std::string &&source, bool process_markup) {
HTML::HTML(std::string &&source, bool process_markup, Options &&options) : options_(std::move(options)) {
if (!process_markup) return;
std::string original = std::move(source);
markup::instream in(original.data(), original.data() + original.size());
markup::Scanner scanner(in);
@ -450,6 +280,8 @@ HTML::HTML(std::string &&source, bool process_markup) {
Tag *tag;
Taint stack;
bool addSentenceBreak = false;
bool addSpace = false;
spans_.push_back(Span{0, 0, {}});
bool stop = false;
@ -463,24 +295,41 @@ HTML::HTML(std::string &&source, bool process_markup) {
break;
case markup::Scanner::TT_TEXT: {
// If the previous segment was the open or close tag of a block element
// we treat the text after it as a new sentence.
if (addSentenceBreak) {
if (!(source.empty() || (source.size() > 2 && source.substr(source.size() - 2) == ""))) {
stack.push_back(makeTag({Tag::WHITESPACE}));
// Important: span->size() == 0 to make it behave as a void element.
// Also important: position before the \n\n tokens, not after, to
// make it easier to remove them later through apply().
spans_.push_back(Span{source.size(), source.size(), stack});
source.append("\n\n"); // TODO assumes ssplit-mode = wrapped_text
stack.pop_back();
}
addSentenceBreak = false;
}
// If the previous segment was an open or close tag, it might be best
// to add a space to make sure we don't append to the previous word.
if (addSpace) {
if (options_.substituteInlineTagsWithSpaces && !source.empty() && !std::isspace(source.back()) &&
!std::isspace(scanner.value()[0])) {
source.push_back(' ');
}
addSpace = false;
}
auto begin = source.size();
source.append(scanner.value());
spans_.push_back(Span{begin, source.size(), stack});
} break;
case markup::Scanner::TT_TAG_START:
// If it makes sense to treat this element as a break in a word (e.g.
// <br>, <img>, <li>) make sure it does so in this text as well.
// TODO: Strong assumption here that the language uses spaces to
// separate words
if (isBlockElement(scanner.tag()) && !source.empty() && source.back() != ' ') source.push_back(' ');
// pool_ takes ownership of our tag, makes sure it's freed when necessary
pool_.emplace_back(new Tag{isVoidTag(scanner.tag()) ? Tag::VOID_ELEMENT : Tag::ELEMENT,
std::string(scanner.tag()), std::string()});
case markup::Scanner::TT_TAG_START: {
std::string name(scanner.tag());
// Tag *tag is used by attribute parsing
tag = pool_.back().get();
tag = makeTag({contains(options_.voidTags, name) ? Tag::VOID_ELEMENT : Tag::ELEMENT, std::move(name)});
stack.push_back(tag);
@ -491,7 +340,14 @@ HTML::HTML(std::string &&source, bool process_markup) {
spans_.push_back(Span{source.size(), source.size(), stack});
stack.pop_back();
}
break;
// Treat non-inline HTML tags as spaces that break up words.
if (!contains(options_.inlineTags, tag->name)) {
addSentenceBreak = true;
} else {
addSpace = true;
}
} break;
case markup::Scanner::TT_TAG_END:
// Note: self-closing tags emit TT_TAG_END immediately after TT_TAG_START
@ -508,6 +364,13 @@ HTML::HTML(std::string &&source, bool process_markup) {
spans_.push_back(Span{source.size(), source.size(), stack});
stack.pop_back();
// Add space if necessary
if (!contains(options_.inlineTags, std::string(scanner.tag()))) {
addSentenceBreak = true;
} else {
addSpace = true;
}
break;
case markup::Scanner::TT_ATTRIBUTE:
@ -516,18 +379,16 @@ HTML::HTML(std::string &&source, bool process_markup) {
break;
case markup::Scanner::TT_COMMENT_START:
// pool_ takes ownership of our tag, makes sure it's freed when necessary
pool_.emplace_back(new Tag{Tag::COMMENT});
tag = pool_.back().get();
// Tag *tag is used when TT_DATA is seen to add the comment's content.
tag = makeTag({Tag::COMMENT});
stack.push_back(tag);
spans_.push_back(Span{source.size(), source.size(), stack});
stack.pop_back();
break;
case markup::Scanner::TT_PROCESSING_INSTRUCTION_START:
// pool_ takes ownership of our tag, makes sure it's freed when necessary
pool_.emplace_back(new Tag{Tag::PROCESSING_INSTRUCTION});
tag = pool_.back().get();
// Tag *tag is used when TT_DATA is seen to add the PI's content.
tag = makeTag({Tag::PROCESSING_INSTRUCTION});
stack.push_back(tag);
spans_.push_back(Span{source.size(), source.size(), stack});
stack.pop_back();
@ -551,7 +412,7 @@ HTML::HTML(std::string &&source, bool process_markup) {
if (!stack.empty()) throw BadHTML(format("Not all tags were closed: {}", stack));
// Add a trailing span (that's empty) to signify all closed tags.
spans_.emplace_back(Span{source.size() + 1, source.size() + 1, stack});
spans_.emplace_back(Span{source.size(), source.size(), stack});
}
void HTML::restore(Response &response) {
@ -580,7 +441,7 @@ void HTML::restore(Response &response) {
// RestoreSource re-inserts HTML into the source text, but also identifies
// which span each source token fits into best.
AnnotatedText source = restoreSource(response.source, spans_, sourceTokenSpans);
AnnotatedText source = restoreSource(response.source, sourceTokenSpans);
assert(sourceTokenSpans.size() == debugCountTokens(response.source));
// Find for every token in target the token in source that best matches.
@ -591,10 +452,193 @@ void HTML::restore(Response &response) {
copyTaint(response, alignments, sourceTokenSpans, targetTokenSpans);
assert(targetTokenSpans.size() == debugCountTokens(response.target));
AnnotatedText target = restoreTarget(response.target, spans_, targetTokenSpans);
AnnotatedText target = restoreTarget(response.target, targetTokenSpans);
response.source = source;
response.target = target;
}
// Rebuilds the source text with its HTML markup re-inserted, one token at a
// time through apply(). As a side effect, appends to `sourceTokenSpans` the
// span that was selected for each token passed to the callback, so later
// stages can look up which tags belong to which source token.
AnnotatedText HTML::restoreSource(AnnotatedText const &in, std::vector<SpanIterator> &sourceTokenSpans) {
  // `cursor` walks forward through spans_, `emitted` is the span whose tags
  // were written out most recently. Starting both at the first span is safe
  // because the first span is always the empty one; the loop inside the
  // callback takes care of the rest.
  auto cursor = spans_.begin();
  auto emitted = spans_.begin();
  assert(emitted == spans_.end() || emitted->tags.empty());

  return apply(in, [&](ByteRange range, string_view token, bool last) {
    TokenFormatter formatter(token);

    // Potential issue: spans and tokens can intersect, e.g.
    //
    //    text   <p> h <u> e </u> ll o </p>
    //   spans      |1|   |2|    |3333| (so only 2 is tainted with <p><u>, others only <p>)
    //  tokens      |111111111111111|2|
    //
    // Token 1 covers spans 1 to 3, so it is ambiguous whether it should be
    // tainted with just <p> or with <p><u>. This only matters when an element
    // does not introduce a token boundary; if spaces are inserted around all
    // elements, every segment of `hello` ends up as its own token.

    // Advance to the last span that still overlaps with this token (or, for
    // the last token, all the way to the final span), emitting the tag
    // changes of every span we pass on the way.
    for (;;) {
      formatter.append(emitted->tags, cursor->tags);
      emitted = cursor;

      auto const next = cursor + 1;
      bool const advance = next != spans_.end() && (next->begin < range.end || last);
      if (!advance) break;
      cursor = next;
    }

    // TODO: Only the taint of the last overlapping span is recorded, not that
    // of the spans in between. This loses the markup of parts of tokens, as
    // described in the example above.
    sourceTokenSpans.push_back(emitted);

    return std::move(formatter.html());
  });
}
// Rebuilds the target text with HTML markup inserted, one token at a time
// through apply().
//
// `targetTokenSpans` holds, for every token the callback will be called with
// (in callback order), an iterator into spans_ naming the span whose tags that
// token should carry. Spans that lie between two consecutive entries and that
// are either empty (empty and void elements) or not referenced by any target
// token are emitted as well, so their markup is not lost.
AnnotatedText HTML::restoreTarget(AnnotatedText const &in, std::vector<SpanIterator> const &targetTokenSpans) {
  // Membership table: spanIsAligned[i] is non-zero iff spans_[i] is referenced
  // by at least one entry of targetTokenSpans. Built once up front so the
  // per-token loop below does not have to scan targetTokenSpans for every
  // span it passes (which made the whole pass quadratic).
  std::vector<char> spanIsAligned(spans_.size(), 0);
  for (SpanIterator const &aligned : targetTokenSpans) {
    auto const index = aligned - spans_.cbegin();
    // Every entry is dereferenced further down, so it has to point at an
    // actual element of spans_.
    assert(index >= 0 && static_cast<size_t>(index) < spans_.size());
    spanIsAligned[static_cast<size_t>(index)] = 1;
  }

  auto prevSpan = spans_.cbegin();
  auto targetSpanIt = targetTokenSpans.begin();

  AnnotatedText out = apply(in, [&](ByteRange range, string_view token, bool last) {
    TokenFormatter formatter(token);

    // First we scan through spans_ to catch up to the span assigned to this
    // token.
    for (auto spanIt = prevSpan; spanIt < *targetSpanIt; ++spanIt) {
      // We're only interested in empty spans, or in spans that would otherwise
      // get lost because no target token aligned with them. Non-empty spans
      // that some target token does point at are emitted with that token.
      if (spanIt->size() != 0 && spanIsAligned[static_cast<size_t>(spanIt - spans_.cbegin())]) continue;

      formatter.append(prevSpan->tags, spanIt->tags);

      // Note: updated here, not in the for-statement, because prevSpan must
      // not move when the span was skipped by the `continue` above.
      prevSpan = spanIt;
    }

    // Now do the same thing for the span assigned to this token. This cannot
    // be folded into the loop above (i.e. `spanIt <= *targetSpanIt`) because
    // there is no guarantee that the order in `targetTokenSpans` is the same
    // as that of spans_.
    formatter.append(prevSpan->tags, (*targetSpanIt)->tags);

    // If this is the last token of the response, close all open tags.
    if (last) {
      // Note: with the current implementation of hardAlignments(), which
      // always matches the last token of the input with the last token of the
      // output, the tag list is expected to be empty here already. Closing
      // explicitly keeps this correct should hardAlignments() ever change.
      // assert((*targetSpanIt)->tags.empty());
      formatter.append((*targetSpanIt)->tags, HTML::Taint());
    }

    prevSpan = *targetSpanIt;
    ++targetSpanIt;

    return std::move(formatter.html());
  });

  // Assert that we did in fact use all our taints
  assert(targetSpanIt == targetTokenSpans.end());

  return out;
}
// Moves `tag` into pool_, which owns it from then on, and returns a pointer to
// the pooled object for use in Taints.
HTML::Tag *HTML::makeTag(Tag &&tag) {
  Tag &pooled = pool_.emplace_front(std::move(tag));
  return &pooled;
}
// Fills `targetTokenSpans` by looking up, for every target token, the span of
// the source token it was aligned with in `alignments`
// (alignments[sentence][targetToken] == index of the source token within that
// sentence). Gaps before each sentence and the gap at the very end copy the
// span of the corresponding source gap.
void HTML::copyTaint(Response const &response, std::vector<std::vector<size_t>> const &alignments,
                     std::vector<SpanIterator> const &sourceTokenSpans, std::vector<SpanIterator> &targetTokenSpans) {
  // Index in sourceTokenSpans of the gap that precedes the current sentence.
  size_t sentenceStart = 0;

  // NOTE: the order in which entries are appended should match the exact
  // order in which apply() visits tokens.
  size_t const sentenceCount = response.target.numSentences();
  for (size_t sentenceIdx = 0; sentenceIdx != sentenceCount; ++sentenceIdx) {
    // Span for the gap in front of this sentence.
    targetTokenSpans.push_back(sourceTokenSpans[sentenceStart]);

    // The sentence's words follow directly after its prefix gap.
    size_t const firstWord = sentenceStart + 1;
    size_t const targetWords = response.target.numWords(sentenceIdx);
    for (size_t t = 0; t != targetWords; ++t) {
      size_t const s = alignments[sentenceIdx][t];
      assert(s < response.source.numWords(sentenceIdx));
      targetTokenSpans.push_back(sourceTokenSpans[firstWord + s]);
    }

    // Skip over this sentence's words; lands on the next gap.
    sentenceStart = firstWord + response.source.numWords(sentenceIdx);
  }

  // Only the gap after the last sentence should be left.
  assert(sentenceStart + 1 == sourceTokenSpans.size());
  targetTokenSpans.push_back(sourceTokenSpans[sentenceStart]);
}
// Tells whether token `str` is likely to continue the word that `prev` is part
// of. This decides whether `str` should share the markup of `prev` or be seen
// as a fresh start. A token counts as a continuation only when neither its
// first character nor the last character of `prev` is one of
// options_.continuationDelimiters. With delimiters such as "[]" this treats
// "hello[world]" as 4 words, assuming it's tokenised as something like
// `h ell o [ wor ld ]`. Always false when no delimiters are configured or
// when either token is empty.
bool HTML::isContinuation(string_view prev, string_view str) {
  std::string const &delimiters = options_.continuationDelimiters;

  if (delimiters.empty()) return false;
  if (prev.empty() || str.empty()) return false;

  // `str` itself starts with a delimiter: it begins something new.
  if (delimiters.find(str[0]) != std::string::npos) return false;

  // Otherwise it continues `prev` unless `prev` ended on a delimiter.
  return delimiters.find(prev.back()) == std::string::npos;
}
// Turns the soft alignment scores in `response.alignments` into one source
// token index per target token. For every sentence a new vector is appended to
// `alignments`, holding at position t the index of the source token that
// target token t is aligned with.
void HTML::hardAlignments(Response const &response, std::vector<std::vector<size_t>> &alignments) {
  size_t const sentenceCount = response.target.numSentences();

  for (size_t sentenceIdx = 0; sentenceIdx < sentenceCount; ++sentenceIdx) {
    std::vector<size_t> &hard = alignments.emplace_back();
    size_t const targetWords = response.target.numWords(sentenceIdx);
    size_t const sourceWords = response.source.numWords(sentenceIdx);

    // Step 1: for each target token pick the source token with the highest
    // score. The last token of both sides is left out of the search: it is
    // the end-of-sentence token, which can only align with the
    // end-of-sentence token of the other side.
    for (size_t t = 0; t + 1 < targetWords; ++t) {
      size_t best = 0;
      for (size_t s = 1; s + 1 < sourceWords; ++s) {
        if (response.alignments[sentenceIdx][t][s] > response.alignments[sentenceIdx][t][best]) best = s;
      }
      hard.push_back(best);
    }

    // Step 2: smooth the selection so that all tokens of one word end up
    // aligned with the same source token, namely the one with the best score
    // seen for that word.
    for (size_t t = 1; t + 1 < targetWords; ++t) {
      // Tokens that start a new word keep their own alignment.
      if (!isContinuation(response.target.word(sentenceIdx, t - 1), response.target.word(sentenceIdx, t))) continue;

      // Only the previous token needs to be looked at: it already carries the
      // outcome of this treatment for the part of the word before it.
      size_t const currSource = hard[t];
      size_t const prevSource = hard[t - 1];
      float const currScore = response.alignments[sentenceIdx][t][currSource];
      float const prevScore = response.alignments[sentenceIdx][t - 1][prevSource];

      if (!(currScore >= prevScore)) {
        // The word so far has the stronger alignment; follow it.
        hard[t] = prevSource;
        continue;
      }

      // This token has the stronger alignment; apply it to this token and
      // walk back over all previous tokens of the same word.
      size_t i = t;
      while (true) {
        hard[i] = currSource;

        // Stop if this was the first token or the beginning of the word
        if (i == 0 || !isContinuation(response.target.word(sentenceIdx, i - 1), response.target.word(sentenceIdx, i)))
          break;
        --i;
      }
    }

    // Always align target end with source end
    hard.push_back(sourceWords - 1);
  }
}
} // namespace marian::bergamot

View File

@ -1,9 +1,12 @@
#ifndef SRC_BERGAMOT_HTML_H_
#define SRC_BERGAMOT_HTML_H_
#include <forward_list>
#include <stdexcept>
#include <string>
#include <unordered_set>
#include "annotation.h"
#include "definitions.h"
namespace marian {
@ -18,40 +21,84 @@ class BadHTML : public std::runtime_error {
class HTML {
public:
// Settings that control how the HTML constructor turns markup into plain text
// and how markup is matched back to tokens afterwards. Every field has a
// default, so a default-constructed Options is usable as-is.
struct Options {
// List of elements for which we do not expect a closing tag, or self-closing
// elements in XHTML. See also https://developer.mozilla.org/en-US/docs/Glossary/Empty_element
// More relevant source of this list:
// https://searchfox.org/mozilla-central/rev/7d17fd1fe9f0005a2fb19e5d53da4741b06a98ba/dom/base/FragmentOrElement.cpp#1791
// Tags whose name is in this set are created as Tag::VOID_ELEMENT instead of
// Tag::ELEMENT.
std::unordered_set<std::string> voidTags{"area", "base", "basefont", "bgsound", "br", "col",
"embed", "frame", "hr", "img", "input", "keygen",
"link", "meta", "param", "source", "track", "wbr"};
// List of elements that are treated as inline. An opening or closing tag that
// is NOT in this set causes the text after it to start a new sentence; a tag
// that is in this set at most causes a space to be inserted (see
// substituteInlineTagsWithSpaces).
std::unordered_set<std::string> inlineTags{"abbr", "a", "b", "em", "i", "kbd", "mark", "math",
"output", "q", "ruby", "small", "span", "strong", "sub", "sup",
"time", "u", "var", "wbr", "ins", "del", "img"};
// List of characters that occur at the start of a token that indicate that
// this token is probably *not* a continuation of a word. The last character
// of the preceding token is checked against the same list. Set to empty
// to never mark a token as a continuation of the word.
// std::string continuationDelimiters = "\n ,.(){}[]";
std::string continuationDelimiters;
// Should we always add spaces to the places where tags used to be? I.e.
// `un<u>der</u>line` should become `un der line`? A space is only added when
// the text before and after the tag does not already have whitespace there.
bool substituteInlineTagsWithSpaces = true;
};
struct Tag {
enum NodeType {
ELEMENT,
VOID_ELEMENT,
COMMENT,
PROCESSING_INSTRUCTION,
WHITESPACE, // negative space
};
NodeType type; // Type of the node
std::string name;
std::string attributes;
std::string data; // Raw data of an element that just needs to be
// copied as is, e.g. <script> or <style>
// TODO: replace with string_view if input lives that long
NodeType type; // Type of the node
std::string name; // Tag name (if type is ELEMENT or VOID_ELEMENT)
std::string attributes; // Tag attributes (as raw HTML string, including
// entities and prefix whitespace)
std::string data; // Raw data of an element that just needs to be
// copied as is, e.g. <script> or <style>
// @TODO: if the original HTML stays in memory, we could replace
// `attributes` and `data` with string_views pointing to it.
};
typedef std::vector<Tag *> Taint;
using Taint = std::vector<Tag *>;
struct Span {
size_t begin;
size_t end;
Taint tags; // Note: free pointer! Lifetime of tags is managed by pool_
Taint tags; // Note: free pointers! Lifetime of tags is managed by pool_
inline size_t size() const { return end - begin; }
};
explicit HTML(std::string &&source, bool process_markup);
explicit HTML(std::string &&source, bool process_markup) : HTML(std::move(source), process_markup, HTML::Options{}){};
explicit HTML(std::string &&source, bool process_markup, Options &&options);
void restore(Response &response);
private:
using SpanIterator = std::vector<HTML::Span>::const_iterator;
using AnnotatedText = marian::bergamot::AnnotatedText;
AnnotatedText restoreSource(AnnotatedText const &in, std::vector<SpanIterator> &sourceTokenSpans);
AnnotatedText restoreTarget(AnnotatedText const &in, std::vector<SpanIterator> const &targetTokenSpans);
void copyTaint(Response const &response, std::vector<std::vector<size_t>> const &alignments,
std::vector<HTML::SpanIterator> const &sourceTokenSpans,
std::vector<HTML::SpanIterator> &targetTokenSpans);
void hardAlignments(Response const &response, std::vector<std::vector<size_t>> &alignments);
bool isContinuation(string_view prev, string_view str);
// Allocates tag in pool_ (which then owns it) and gives a pointer to be used
// in Taints. Pointer is valid as long as this HTML instance lives on.
Tag *makeTag(Tag &&tag);
Options options_;
// List of text spans, and which tags are applied to them
std::vector<Span> spans_;
// a pool of tags that we free when HTML goes out of scope
std::vector<std::unique_ptr<Tag>> pool_;
std::forward_list<Tag> pool_;
};
} // namespace bergamot