ADDED: Add CriticMarkup preprocessor that works across empty lines when accepting/rejecting markup

This commit is contained in:
Fletcher T. Penney 2017-03-12 18:16:53 -04:00
parent 73d6446a76
commit 842a9141ae
10 changed files with 486 additions and 12 deletions

View File

@ -175,6 +175,7 @@ set(src_files
Sources/libMultiMarkdown/aho-corasick.c
Sources/libMultiMarkdown/beamer.c
Sources/libMultiMarkdown/char.c
Sources/libMultiMarkdown/critic_markup.c
Sources/libMultiMarkdown/d_string.c
Sources/libMultiMarkdown/html.c
Sources/libMultiMarkdown/latex.c
@ -198,6 +199,7 @@ set(header_files
Sources/libMultiMarkdown/aho-corasick.h
Sources/libMultiMarkdown/beamer.h
Sources/libMultiMarkdown/char.h
Sources/libMultiMarkdown/critic_markup.h
Sources/libMultiMarkdown/include/d_string.h
Sources/libMultiMarkdown/html.h
Sources/libMultiMarkdown/latex.h
@ -569,6 +571,10 @@ ADD_MMD_TEST(mmd-6-latex "-t latex" MMD6Tests tex)
ADD_MMD_TEST(mmd-6-odf "-t odf" MMD6Tests fodt)
ADD_MMD_TEST(mmd-6-critic-accept "-a" CriticMarkup htmla)
ADD_MMD_TEST(mmd-6-critic-reject "-r" CriticMarkup htmlr)
ADD_MMD_TEST(pathologic-compat "-c" ../build html)
ADD_MMD_TEST(pathologic "" ../build html)

View File

@ -361,7 +361,7 @@ match * match_add(match * last, size_t start, size_t len, unsigned short match_t
}
match * ac_trie_search(trie * a, const char * source, size_t len) {
match * ac_trie_search(trie * a, const char * source, size_t start, size_t len) {
// Store results in a linked list
// match * result = match_new(0, 0, 0);
@ -374,9 +374,10 @@ match * ac_trie_search(trie * a, const char * source, size_t len) {
// Character being compared
int test_value;
size_t counter = 0;
size_t counter = start;
size_t stop = start + len;
while ((counter < len) && (source[counter] != '\0')) {
while ((counter < stop) && (source[counter] != '\0')) {
// Read next character
test_value = (int)source[counter++];
@ -494,8 +495,8 @@ void match_set_filter_leftmost_longest(match * header) {
}
match * ac_trie_leftmost_longest_search(trie * a, const char * source, size_t len) {
match * result = ac_trie_search(a, source, len);
match * ac_trie_leftmost_longest_search(trie * a, const char * source, size_t start, size_t len) {
match * result = ac_trie_search(a, source, start, len);
if (result)
match_set_filter_leftmost_longest(result);
@ -535,12 +536,12 @@ void Test_aho_trie_search(CuTest* tc) {
ac_trie_prepare(a);
m = ac_trie_search(a, "ABCDEFGGGAZABCABCDZABCABCZ", 26);
m = ac_trie_search(a, "ABCDEFGGGAZABCABCDZABCABCZ", 0, 26);
fprintf(stderr, "Finish with %d matches\n", match_count(m));
match_set_describe(m, "ABCDEFGGGAZABCABCDZABCABCZ");
match_free(m);
m = ac_trie_leftmost_longest_search(a, "ABCDEFGGGAZABCABCDZABCABCZ", 26);
m = ac_trie_leftmost_longest_search(a, "ABCDEFGGGAZABCABCDZABCABCZ", 0, 26);
fprintf(stderr, "Finish with %d matches\n", match_count(m));
match_set_describe(m, "ABCDEFGGGAZABCABCDZABCABCZ");
match_free(m);

View File

@ -96,9 +96,9 @@ bool trie_insert(trie * a, const char * key, unsigned short match_type);
void ac_trie_prepare(trie * a);
match * ac_trie_search(trie * a, const char * source, size_t len);
match * ac_trie_search(trie * a, const char * source, size_t start, size_t len);
match * ac_trie_leftmost_longest_search(trie * a, const char * source, size_t len);
match * ac_trie_leftmost_longest_search(trie * a, const char * source, size_t start, size_t len);
void trie_free(trie * a);
@ -109,6 +109,9 @@ void match_set_filter_leftmost_longest(match * header);
void match_free(match * m);
void trie_to_graphviz(trie * a);
#ifdef TEST
#include "CuTest.h"
#endif

View File

@ -0,0 +1,288 @@
/**
MultiMarkdown -- Lightweight markup processor to produce HTML, LaTeX, and more.
@file critic_markup.c
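@brief Preprocessor that accepts or rejects CriticMarkup changes in raw text, including markup that spans empty lines.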
@author Fletcher T. Penney
@bug
**/
/*
Copyright © 2016 - 2017 Fletcher T. Penney.
The `MultiMarkdown 6` project is released under the MIT License.
GLibFacade.c and GLibFacade.h are from the MultiMarkdown v4 project:
https://github.com/fletcher/MultiMarkdown-4/
MMD 4 is released under both the MIT License and GPL.
CuTest is released under the zlib/libpng license. See CuTest.c for the
text of the license.
## The MIT License ##
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include <stdbool.h>
#include <string.h>
#include "aho-corasick.h"
#include "critic_markup.h"
#include "stack.h"
#include "token_pairs.h"
/// Scan `len` bytes of `source`, beginning at byte offset `start`, for
/// CriticMarkup delimiters and build a flat token chain from the result.
///
/// @param source  Text to scan; offsets stored in tokens are relative to it.
/// @param start   Byte offset at which scanning begins.
/// @param len     Number of bytes to scan.
/// @return        A root token whose children are delimiter tokens (typed with
///                the matching `CM_*` value) interleaved with `CM_PLAIN_TEXT`
///                tokens covering the text between them, or NULL when the
///                search returns no match list.  The caller releases the tree.
token * critic_tokenize_string(const char * source, size_t start, size_t len) {
	trie * ac = trie_new(0);

	trie_insert(ac, "{++", CM_ADD_OPEN);
	trie_insert(ac, "++}", CM_ADD_CLOSE);

	trie_insert(ac, "{--", CM_DEL_OPEN);
	trie_insert(ac, "--}", CM_DEL_CLOSE);

	trie_insert(ac, "{~~", CM_SUB_OPEN);
	trie_insert(ac, "~>", CM_SUB_DIV);
	trie_insert(ac, "~~}", CM_SUB_CLOSE);

	trie_insert(ac, "{==", CM_HI_OPEN);
	trie_insert(ac, "==}", CM_HI_CLOSE);

	trie_insert(ac, "{>>", CM_COM_OPEN);
	trie_insert(ac, "<<}", CM_COM_CLOSE);

	ac_trie_prepare(ac);

	match * m = ac_trie_leftmost_longest_search(ac, source, start, len);

	token * root = NULL;

	if (m) {
		// One past the last byte of the scanned region
		size_t stop = start + len;

		// Offset of the first byte not yet covered by a token
		size_t last = start;

		root = token_new(0, 0, 0);

		// The first node of the match list is a header; real matches follow it
		for (match * walker = m->next; walker; walker = walker->next) {
			if (walker->start > last) {
				// Text between the previous token and this delimiter
				token_append_child(root, token_new(CM_PLAIN_TEXT, last, walker->start - last));
				last = walker->start;
			}

			if (walker->start == last) {
				token_append_child(root, token_new(walker->match_type, walker->start, walker->len));
				last = walker->start + walker->len;
			}
		}

		if (last < stop) {
			// Remaining text after the final delimiter -- the third argument
			// is a length, so it must be measured from `last`, not from 0
			token_append_child(root, token_new(CM_PLAIN_TEXT, last, stop - last));
		}

		match_free(m);
	}

	// Release the trie on every path, including when the search found nothing
	trie_free(ac);

	return root;
}
/// Tokenize `len` bytes of `source` starting at `start`, then run the token
/// pair engine over the result so that each kind of CriticMarkup opener is
/// matched with its closer.
///
/// @return The root token produced by critic_tokenize_string() after pair
///         matching, or NULL if tokenizing returned NULL.
token * critic_parse_substring(const char * source, size_t start, size_t len) {
	token * root = critic_tokenize_string(source, start, len);

	if (root == NULL) {
		return NULL;
	}

	// { opener, closer, resulting pair type } -- registered in this order
	static const int pairings[][3] = {
		{ CM_ADD_OPEN, CM_ADD_CLOSE, CM_ADD_PAIR },
		{ CM_DEL_OPEN, CM_DEL_CLOSE, CM_DEL_PAIR },
		{ CM_SUB_OPEN, CM_SUB_CLOSE, CM_SUB_PAIR },
		{ CM_HI_OPEN,  CM_HI_CLOSE,  CM_HI_PAIR  },
		{ CM_COM_OPEN, CM_COM_CLOSE, CM_COM_PAIR }
	};
	const size_t pairing_count = sizeof pairings / sizeof pairings[0];

	token_pair_engine * engine = token_pair_engine_new();

	for (size_t i = 0; i < pairing_count; i++) {
		token_pair_engine_add_pairing(engine, pairings[i][0], pairings[i][1], pairings[i][2],
									  PAIRING_ALLOW_EMPTY | PAIRING_PRUNE_MATCH);
	}

	stack * pending = stack_new(0);

	token_pairs_match_pairs_inside_token(root, engine, pending, 0);

	stack_free(pending);
	token_pair_engine_free(engine);

	return root;
}
void accept_token_tree(DString * d, token * t);
void accept_token(DString * d, token * t);

/// Accept a substitution: walk backward from `t` along `prev` links.
/// Tokens met before a `CM_SUB_DIV` are handed to accept_token(); the
/// divider itself and every token preceding it are erased from `d`.
/// If no divider is found, every token is simply passed to accept_token().
void accept_token_tree_sub(DString * d, token * t) {
	// Right-hand side of the substitution: keep it, resolving nested markup
	for (; t && t->type != CM_SUB_DIV; t = t->prev) {
		accept_token(d, t);
	}

	// Divider and everything to its left: remove outright.  Walking
	// backward keeps the offsets of not-yet-visited tokens valid.
	for (; t; t = t->prev) {
		d_string_erase(d, t->start, t->len);
	}
}
/// Apply "accept" semantics to a single token, editing `d` in place.
///
/// - Substitution/addition delimiters are erased only when they have a mate.
/// - A `CM_SUB_DIV` that reaches this switch, and whole deletion, comment and
///   highlight pairs, are erased unconditionally.
/// - Substitution and addition pairs are descended into, starting from the
///   mate of their first child.
/// - Any other token type is left untouched.
void accept_token(DString * d, token * t) {
	switch (t->type) {
		// Delimiters: strip only if they were actually paired
		case CM_SUB_OPEN:
		case CM_SUB_CLOSE:
		case CM_ADD_OPEN:
		case CM_ADD_CLOSE:
			if (t->mate) {
				d_string_erase(d, t->start, t->len);
			}

			break;

		// Always removed when accepting
		case CM_SUB_DIV:
		case CM_DEL_PAIR:
		case CM_COM_PAIR:
		case CM_HI_PAIR:
			d_string_erase(d, t->start, t->len);
			break;

		// Drop the old version and the markers, keep the new version
		case CM_SUB_PAIR:
			accept_token_tree_sub(d, t->child->mate);
			break;

		// Keep the added text, but process whatever is nested inside it
		case CM_ADD_PAIR:
			accept_token_tree(d, t->child->mate);
			break;

		default:
			break;
	}
}
/// Run accept_token() on `t` and on each token reachable through `prev`.
/// The chain is visited back to front so that erasing text never shifts the
/// offsets of tokens that are still waiting to be processed.
void accept_token_tree(DString * d, token * t) {
	for (token * walker = t; walker; walker = walker->prev) {
		accept_token(d, walker);
	}
}
/// Accept all CriticMarkup changes in `d`, rewriting its contents in place.
///
/// The whole buffer is parsed, then the resulting token chain is processed
/// from its tail backward.  Nothing is modified when parsing yields no tree
/// or a tree without children (for example, an empty buffer).
void critic_markup_accept(DString * d) {
	token * t = critic_parse_substring(d->str, 0, d->currentStringLength);

	if (!t) {
		// The parser returns NULL when the search produced no match list
		return;
	}

	if (t->child) {
		// Only touch the chain when the root actually has tokens under it
		accept_token_tree(d, t->child->tail);
	}

	token_free(t);
}
void reject_token_tree(DString * d, token * t);
void reject_token(DString * d, token * t);

/// Reject a substitution: walk backward from `t` along `prev` links.
/// Tokens met before a `CM_SUB_DIV` are erased from `d`; the divider and
/// every token preceding it are handed to reject_token().  If no divider is
/// found, every token in the chain is erased.
void reject_token_tree_sub(DString * d, token * t) {
	while (t) {
		if (t->type == CM_SUB_DIV) {
			// Divider reached: the remainder is the old version, which is
			// kept after resolving its markers and nested markup
			while (t) {
				reject_token(d, t);
				t = t->prev;
			}

			return;
		}

		// Still inside the new version -- discard it
		d_string_erase(d, t->start, t->len);
		t = t->prev;
	}
}
/// Apply "reject" semantics to a single token, editing `d` in place.
///
/// - Substitution/deletion delimiters are erased only when they have a mate.
/// - A `CM_SUB_DIV` that reaches this switch, and whole addition, comment and
///   highlight pairs, are erased unconditionally.
/// - Substitution and deletion pairs are descended into, starting from the
///   mate of their first child.
/// - Any other token type is left untouched.
void reject_token(DString * d, token * t) {
	switch (t->type) {
		// Delimiters: strip only if they were actually paired
		case CM_SUB_OPEN:
		case CM_SUB_CLOSE:
		case CM_DEL_OPEN:
		case CM_DEL_CLOSE:
			if (t->mate) {
				d_string_erase(d, t->start, t->len);
			}

			break;

		// Always removed when rejecting
		case CM_SUB_DIV:
		case CM_ADD_PAIR:
		case CM_COM_PAIR:
		case CM_HI_PAIR:
			d_string_erase(d, t->start, t->len);
			break;

		// Drop the new version and the markers, keep the old version
		case CM_SUB_PAIR:
			reject_token_tree_sub(d, t->child->mate);
			break;

		// Keep the text that was to be deleted, but process nested markup
		case CM_DEL_PAIR:
			reject_token_tree(d, t->child->mate);
			break;

		default:
			break;
	}
}
/// Run reject_token() on `t` and on each token reachable through `prev`.
/// The chain is visited back to front so that erasing text never shifts the
/// offsets of tokens that are still waiting to be processed.
void reject_token_tree(DString * d, token * t) {
	for (token * walker = t; walker; walker = walker->prev) {
		reject_token(d, walker);
	}
}
/// Reject all CriticMarkup changes in `d`, rewriting its contents in place.
///
/// The whole buffer is parsed, then the resulting token chain is processed
/// from its tail backward.  Nothing is modified when parsing yields no tree
/// or a tree without children (for example, an empty buffer).
void critic_markup_reject(DString * d) {
	token * t = critic_parse_substring(d->str, 0, d->currentStringLength);

	if (!t) {
		// The parser returns NULL when the search produced no match list
		return;
	}

	if (t->child) {
		// Only touch the chain when the root actually has tokens under it
		reject_token_tree(d, t->child->tail);
	}

	token_free(t);
}

View File

@ -0,0 +1,94 @@
/**
MultiMarkdown -- Lightweight markup processor to produce HTML, LaTeX, and more.
@file critic_markup.h
@brief Public interface for the CriticMarkup accept/reject preprocessor.
@author Fletcher T. Penney
@bug
**/
/*
Copyright © 2016 - 2017 Fletcher T. Penney.
The `MultiMarkdown 6` project is released under the MIT License.
GLibFacade.c and GLibFacade.h are from the MultiMarkdown v4 project:
https://github.com/fletcher/MultiMarkdown-4/
MMD 4 is released under both the MIT License and GPL.
CuTest is released under the zlib/libpng license. See CuTest.c for the
text of the license.
## The MIT License ##
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef CRITIC_MARKUP_MULTIMARKDOWN_H
#define CRITIC_MARKUP_MULTIMARKDOWN_H
#include "d_string.h"
// Token types used by the CriticMarkup preprocessor.
// The *_OPEN, *_DIV and *_CLOSE values label delimiter strings found in the
// text; the *_PAIR values label a matched opener/closer pair.
enum cm_types {
CM_ADD_OPEN = 1, // Can't use type 0 -- delimiter "{++"
CM_ADD_CLOSE, // delimiter "++}"
CM_DEL_OPEN, // delimiter "{--"
CM_DEL_CLOSE, // delimiter "--}"
CM_SUB_OPEN, // delimiter "{~~"
CM_SUB_DIV, // delimiter "~>" separating old and new text in a substitution
CM_SUB_CLOSE, // delimiter "~~}"
CM_HI_OPEN, // delimiter "{=="
CM_HI_CLOSE, // delimiter "==}"
CM_COM_OPEN, // delimiter "{>>"
CM_COM_CLOSE, // delimiter "<<}"
CM_ADD_PAIR, // matched CM_ADD_OPEN / CM_ADD_CLOSE
CM_DEL_PAIR, // matched CM_DEL_OPEN / CM_DEL_CLOSE
CM_SUB_PAIR, // matched CM_SUB_OPEN / CM_SUB_CLOSE
CM_HI_PAIR, // matched CM_HI_OPEN / CM_HI_CLOSE
CM_COM_PAIR, // matched CM_COM_OPEN / CM_COM_CLOSE
CM_PLAIN_TEXT // run of text between delimiters
};
/// Accept CriticMarkup changes, editing the contents of `d` in place:
/// added text is kept (its markers are removed), the new side of a
/// substitution is kept, and deletions, comments and highlights are removed.
void critic_markup_accept(DString * d);
/// Reject CriticMarkup changes, editing the contents of `d` in place:
/// text marked for deletion is kept (its markers are removed), the old side
/// of a substitution is kept, and additions, comments and highlights are
/// removed.
void critic_markup_reject(DString * d);
#endif

View File

@ -1486,7 +1486,7 @@ void process_metadata_stack(mmd_engine * e, scratch_pad * scratch) {
void automatic_search_text(mmd_engine * e, token * t, trie * ac) {
match * m = ac_trie_leftmost_longest_search(ac, &e->dstr->str[t->start], t->len);
match * m = ac_trie_leftmost_longest_search(ac, e->dstr->str, t->start, t->len);
match * walker;
@ -1496,7 +1496,7 @@ void automatic_search_text(mmd_engine * e, token * t, trie * ac) {
walker = m->next;
while (walker) {
token_split(tok, walker->start + t->start, walker->len, walker->match_type);
token_split(tok, walker->start, walker->len, walker->match_type);
// Advance token to section after the split (if present)
tok = tok->next->next;

View File

@ -61,6 +61,7 @@
#include "argtable3.h"
#include "critic_markup.h"
#include "d_string.h"
#include "i18n.h"
#include "libMultiMarkdown.h"
@ -73,7 +74,8 @@
#define kBUFFERSIZE 4096 // How many bytes to read at a time
// argtable structs
struct arg_lit *a_help, *a_version, *a_compatibility, *a_nolabels, *a_batch, *a_accept, *a_reject, *a_full, *a_snippet;
struct arg_lit *a_help, *a_version, *a_compatibility, *a_nolabels, *a_batch,
*a_accept, *a_reject, *a_full, *a_snippet;
struct arg_str *a_format, *a_lang;
struct arg_file *a_file, *a_o;
struct arg_end *a_end;
@ -349,6 +351,15 @@ int main(int argc, char** argv) {
// Don't free folder -- owned by dirname
}
// Perform block level CriticMarkup?
if (extensions & EXT_CRITIC_ACCEPT) {
critic_markup_accept(buffer);
}
if (extensions & EXT_CRITIC_REJECT) {
critic_markup_reject(buffer);
}
// Increment counter and prepare token pool
#ifdef kUseObjectPool
token_pool_init();
@ -412,6 +423,15 @@ int main(int argc, char** argv) {
// Don't free folder -- owned by dirname
}
// Perform block level CriticMarkup?
if (extensions & EXT_CRITIC_ACCEPT) {
critic_markup_accept(buffer);
}
if (extensions & EXT_CRITIC_REJECT) {
critic_markup_reject(buffer);
}
if (FORMAT_MMD == format) {
result = buffer->str;
} else {

View File

@ -0,0 +1,23 @@
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8"/>
<title>Extended CriticMarkup</title>
</head>
<body>
<p>This is a <em>single</em> paragraph</p>
<p>that was split in two.</p>
<p>This is <em>two</em> paragraphs joined together.</p>
<p>This is two paragraphs</p>
<p>With a <em>new</em> paragraph inserted</p>
<p>between them.</p>
</body>
</html>

View File

@ -0,0 +1,21 @@
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8"/>
<title>Extended CriticMarkup</title>
</head>
<body>
<p>This is a <em>single</em> paragraph that was split in two.</p>
<p>This is <em>two</em> paragraphs</p>
<p>joined together.</p>
<p>This is two paragraphs</p>
<p>with nothing between them.</p>
</body>
</html>

View File

@ -0,0 +1,18 @@
Title: Extended CriticMarkup
latex config: article
This is a *single* paragraph {++
++}that was split in two.
This is *two* paragraphs {--
--}joined together.
This is two paragraphs{~~
with nothing ~>
With a *new* paragraph inserted
~~}between them.