ADDED: Add CriticMarkup preprocessor that works across empty lines when accepting/rejecting markup

2024-10-04 10:37:08 +03:00 · 2017-03-12 18:16:53 -04:00 · 2017-03-12 18:16:53 -04:00 · 842a9141ae
commit 842a9141ae
parent 73d6446a76
10 changed files with 486 additions and 12 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -175,6 +175,7 @@ set(src_files
 	Sources/libMultiMarkdown/aho-corasick.c
 	Sources/libMultiMarkdown/beamer.c
 	Sources/libMultiMarkdown/char.c
+	Sources/libMultiMarkdown/critic_markup.c
 	Sources/libMultiMarkdown/d_string.c
 	Sources/libMultiMarkdown/html.c
 	Sources/libMultiMarkdown/latex.c
@ -198,6 +199,7 @@ set(header_files
 	Sources/libMultiMarkdown/aho-corasick.h
 	Sources/libMultiMarkdown/beamer.h
 	Sources/libMultiMarkdown/char.h
+	Sources/libMultiMarkdown/critic_markup.h
 	Sources/libMultiMarkdown/include/d_string.h
 	Sources/libMultiMarkdown/html.h
 	Sources/libMultiMarkdown/latex.h
@ -569,6 +571,10 @@ ADD_MMD_TEST(mmd-6-latex "-t latex" MMD6Tests tex)

 ADD_MMD_TEST(mmd-6-odf "-t odf" MMD6Tests fodt)

+ADD_MMD_TEST(mmd-6-critic-accept "-a" CriticMarkup htmla)
+
+ADD_MMD_TEST(mmd-6-critic-reject "-r" CriticMarkup htmlr)
+
 ADD_MMD_TEST(pathologic-compat "-c" ../build html)

 ADD_MMD_TEST(pathologic "" ../build html)
--- a/Sources/libMultiMarkdown/aho-corasick.c
+++ b/Sources/libMultiMarkdown/aho-corasick.c
@ -361,7 +361,7 @@ match * match_add(match * last, size_t start, size_t len, unsigned short match_t
 }


-match * ac_trie_search(trie * a, const char * source, size_t len) {
+match * ac_trie_search(trie * a, const char * source, size_t start, size_t len) {

 	// Store results in a linked list
 //	match * result = match_new(0, 0, 0);
@ -374,9 +374,10 @@ match * ac_trie_search(trie * a, const char * source, size_t len) {

 	// Character being compared
 	int test_value;
-	size_t counter = 0;
+	size_t counter = start;
+	size_t stop = start + len;

-	while ((counter < len) && (source[counter] != '\0')) {
+	while ((counter < stop) && (source[counter] != '\0')) {
 		// Read next character
 		test_value = (int)source[counter++];

@ -494,8 +495,8 @@ void match_set_filter_leftmost_longest(match * header) {
 }


-match * ac_trie_leftmost_longest_search(trie * a, const char * source, size_t len) {
-	match * result = ac_trie_search(a, source, len);
+match * ac_trie_leftmost_longest_search(trie * a, const char * source, size_t start, size_t len) {
+	match * result = ac_trie_search(a, source, start, len);

 	if (result)
 		match_set_filter_leftmost_longest(result);
@ -535,12 +536,12 @@ void Test_aho_trie_search(CuTest* tc) {

 	ac_trie_prepare(a);

-	m = ac_trie_search(a, "ABCDEFGGGAZABCABCDZABCABCZ", 26);
+	m = ac_trie_search(a, "ABCDEFGGGAZABCABCDZABCABCZ", 0, 26);
 	fprintf(stderr, "Finish with %d matches\n", match_count(m));
 	match_set_describe(m, "ABCDEFGGGAZABCABCDZABCABCZ");
 	match_free(m);

-	m = ac_trie_leftmost_longest_search(a, "ABCDEFGGGAZABCABCDZABCABCZ", 26);
+	m = ac_trie_leftmost_longest_search(a, "ABCDEFGGGAZABCABCDZABCABCZ", 0, 26);
 	fprintf(stderr, "Finish with %d matches\n", match_count(m));
 	match_set_describe(m, "ABCDEFGGGAZABCABCDZABCABCZ");
 	match_free(m);
--- a/Sources/libMultiMarkdown/aho-corasick.h
+++ b/Sources/libMultiMarkdown/aho-corasick.h
@ -96,9 +96,9 @@ bool trie_insert(trie * a, const char * key, unsigned short match_type);

 void ac_trie_prepare(trie * a);

-match * ac_trie_search(trie * a, const char * source, size_t len);
+match * ac_trie_search(trie * a, const char * source, size_t start, size_t len);

-match * ac_trie_leftmost_longest_search(trie * a, const char * source, size_t len);
+match * ac_trie_leftmost_longest_search(trie * a, const char * source, size_t start, size_t len);

 void trie_free(trie * a);

@ -109,6 +109,9 @@ void match_set_filter_leftmost_longest(match * header);
 void match_free(match * m);


+void trie_to_graphviz(trie * a);
+
+
 #ifdef TEST
 #include "CuTest.h"
 #endif
--- a/Sources/libMultiMarkdown/critic_markup.c
+++ b/Sources/libMultiMarkdown/critic_markup.c
@ -0,0 +1,288 @@
+/**
+
+	MultiMarkdown -- Lightweight markup processor to produce HTML, LaTeX, and more.
+
+	@file critic_markup.c
+
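+	@brief CriticMarkup preprocessor: accepts or rejects CriticMarkup edits in a DString, including edits that span empty lines.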
+
+
+	@author	Fletcher T. Penney
+	@bug	
+
+**/
+
+/*
+
+	Copyright © 2016 - 2017 Fletcher T. Penney.
+
+
+	The `MultiMarkdown 6` project is released under the MIT License.
+	
+	GLibFacade.c and GLibFacade.h are from the MultiMarkdown v4 project:
+	
+		https://github.com/fletcher/MultiMarkdown-4/
+	
+	MMD 4 is released under both the MIT License and GPL.
+	
+	
+	CuTest is released under the zlib/libpng license. See CuTest.c for the
+	text of the license.
+	
+	
+	## The MIT License ##
+	
+	Permission is hereby granted, free of charge, to any person obtaining
+	a copy of this software and associated documentation files (the
+	"Software"), to deal in the Software without restriction, including
+	without limitation the rights to use, copy, modify, merge, publish,
+	distribute, sublicense, and/or sell copies of the Software, and to
+	permit persons to whom the Software is furnished to do so, subject to
+	the following conditions:
+	
+	The above copyright notice and this permission notice shall be
+	included in all copies or substantial portions of the Software.
+	
+	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+	EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+	MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+	IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+	CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+	TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+	SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+	
+
+*/
+
+#include <stdbool.h>
+#include <string.h>
+
+
+#include "aho-corasick.h"
+#include "critic_markup.h"
+#include "stack.h"
+#include "token_pairs.h"
+
+
+// Scan `len` bytes of `source`, beginning at offset `start`, for CriticMarkup
+// delimiters and build a flat token list from the result.
+//
+// Returns a root token whose children are the delimiter tokens (typed with
+// the CM_* value registered for each delimiter) interleaved with
+// CM_PLAIN_TEXT tokens covering the text between them.  Token offsets are
+// relative to the beginning of `source`, not to `start`.
+//
+// Returns NULL when the search does not return a match list.  The caller is
+// responsible for releasing the returned tree (see token_free()).
+token * critic_tokenize_string(const char * source, size_t start, size_t len) {
+	trie * ac = trie_new(0);
+
+	trie_insert(ac, "{++", CM_ADD_OPEN);
+	trie_insert(ac, "++}", CM_ADD_CLOSE);
+
+	trie_insert(ac, "{--", CM_DEL_OPEN);
+	trie_insert(ac, "--}", CM_DEL_CLOSE);
+
+	trie_insert(ac, "{~~", CM_SUB_OPEN);
+	trie_insert(ac, "~>", CM_SUB_DIV);
+	trie_insert(ac, "~~}", CM_SUB_CLOSE);
+
+	trie_insert(ac, "{==", CM_HI_OPEN);
+	trie_insert(ac, "==}", CM_HI_CLOSE);
+
+	trie_insert(ac, "{>>", CM_COM_OPEN);
+	trie_insert(ac, "<<}", CM_COM_CLOSE);
+
+	ac_trie_prepare(ac);
+
+	match * m = ac_trie_leftmost_longest_search(ac, source, start, len);
+
+	token * root = NULL;
+
+	if (m) {
+		// Matches are read starting at m->next; the head node itself is
+		// not treated as a match
+		match * walker = m->next;
+
+		root = token_new(0, 0, 0);
+
+		// Offset of the first byte not yet covered by a token
+		size_t last = start;
+
+		// One past the final byte of the region being tokenized
+		size_t stop = start + len;
+
+		while (walker) {
+			if (walker->start > last) {
+				// Plain text between the previous token and this delimiter
+				token_append_child(root, token_new(CM_PLAIN_TEXT, last, walker->start - last));
+				last = walker->start;
+			}
+
+			// A match that starts before `last` overlaps a token that was
+			// already emitted and is skipped
+			if (walker->start == last) {
+				token_append_child(root, token_new(walker->match_type, walker->start, walker->len));
+				last = walker->start + walker->len;
+			}
+
+			walker = walker->next;
+		}
+
+		if (last < stop) {
+			// Trailing plain text.  token_new() takes a length, so pass the
+			// remaining byte count rather than the end offset.
+			token_append_child(root, token_new(CM_PLAIN_TEXT, last, stop - last));
+		}
+
+		match_free(m);
+	}
+
+	// The trie is only needed for the search above; release it on every
+	// path so it is not leaked when no match list is returned
+	trie_free(ac);
+
+	return root;
+}
+
+
+
+// Tokenize `len` bytes of `source` starting at offset `start`, then run the
+// token pair engine over the result so that matching CriticMarkup open/close
+// delimiters are paired up as CM_*_PAIR tokens.
+//
+// Returns the root token from critic_tokenize_string(), or NULL if
+// tokenizing produced nothing.
+token * critic_parse_substring(const char * source, size_t start, size_t len) {
+	token * root = critic_tokenize_string(source, start, len);
+
+	if (root == NULL) {
+		// Nothing to pair
+		return NULL;
+	}
+
+	token_pair_engine * engine = token_pair_engine_new();
+
+	// Every CriticMarkup construct may be empty, and matched delimiters
+	// are pruned into a single pair token
+	token_pair_engine_add_pairing(engine, CM_ADD_OPEN, CM_ADD_CLOSE, CM_ADD_PAIR, PAIRING_ALLOW_EMPTY | PAIRING_PRUNE_MATCH);
+	token_pair_engine_add_pairing(engine, CM_DEL_OPEN, CM_DEL_CLOSE, CM_DEL_PAIR, PAIRING_ALLOW_EMPTY | PAIRING_PRUNE_MATCH);
+	token_pair_engine_add_pairing(engine, CM_SUB_OPEN, CM_SUB_CLOSE, CM_SUB_PAIR, PAIRING_ALLOW_EMPTY | PAIRING_PRUNE_MATCH);
+	token_pair_engine_add_pairing(engine, CM_HI_OPEN,  CM_HI_CLOSE,  CM_HI_PAIR,  PAIRING_ALLOW_EMPTY | PAIRING_PRUNE_MATCH);
+	token_pair_engine_add_pairing(engine, CM_COM_OPEN, CM_COM_CLOSE, CM_COM_PAIR, PAIRING_ALLOW_EMPTY | PAIRING_PRUNE_MATCH);
+
+	stack * pending = stack_new(0);
+
+	token_pairs_match_pairs_inside_token(root, engine, pending, 0);
+
+	stack_free(pending);
+	token_pair_engine_free(engine);
+
+	return root;
+}
+
+
+void accept_token_tree(DString * d, token * t);
+void accept_token(DString * d, token * t);
+
+
+// Accept a substitution.  `t` is the last token of the substitution; the
+// list is walked backwards through `prev` so that erasing text never shifts
+// the offsets of tokens that are still to be processed.
+//
+// Tokens after the CM_SUB_DIV divider (the new text) are handed to
+// accept_token().  The divider and every token before it (the old text and
+// the opening marker) are erased from `d`.
+void accept_token_tree_sub(DString * d, token * t) {
+	token * cur = t;
+
+	// New version: process normally until the divider is reached
+	for (; cur != NULL; cur = cur->prev) {
+		if (cur->type == CM_SUB_DIV) {
+			break;
+		}
+
+		accept_token(d, cur);
+	}
+
+	// Divider and old version: erase outright
+	for (; cur != NULL; cur = cur->prev) {
+		d_string_erase(d, cur->start, cur->len);
+	}
+}
+
+
+// Apply the "accept changes" rule for a single token to the text in `d`.
+//
+//  * Substitution/addition delimiters are erased only when they were
+//    matched (t->mate is set); unmatched delimiters are left in the text.
+//  * A substitution divider and deletion, comment, and highlight pairs are
+//    erased using the token's own start/len.
+//  * Substitution and addition pairs are processed through their children,
+//    starting from the mate of the first child and walking backwards.
+//  * Any other token type (e.g. CM_PLAIN_TEXT) is left untouched.
+void accept_token(DString * d, token * t) {
+	switch (t->type) {
+		case CM_SUB_CLOSE:
+			if (t->mate) {
+				d_string_erase(d, t->start, t->len);
+			}
+
+			break;
+
+		case CM_SUB_OPEN:
+		case CM_ADD_OPEN:
+		case CM_ADD_CLOSE:
+			if (!t->mate) {
+				// Unmatched delimiter -- keep it as literal text
+				break;
+			}
+
+			/* fallthrough */
+
+		case CM_SUB_DIV:
+		case CM_DEL_PAIR:
+		case CM_COM_PAIR:
+		case CM_HI_PAIR:
+			// Erase these
+			d_string_erase(d, t->start, t->len);
+			break;
+
+		case CM_SUB_PAIR:
+			// Erase old version and markers
+			accept_token_tree_sub(d, t->child->mate);
+			break;
+
+		case CM_ADD_PAIR:
+			// Check children
+			accept_token_tree(d, t->child->mate);
+			break;
+
+		default:
+			// Everything else is kept as-is
+			break;
+	}
+}
+
+
+// Run accept_token() on `t` and then on each earlier sibling in turn.
+// The chain is traversed last-to-first (via `prev`) so that erasing text
+// does not invalidate the offsets of tokens not yet visited.
+void accept_token_tree(DString * d, token * t) {
+	for (token * cur = t; cur != NULL; cur = cur->prev) {
+		accept_token(d, cur);
+	}
+}
+
+// Accept all CriticMarkup changes in `d`, editing the string in place.
+//
+// The whole buffer is parsed into a token tree, and the tree's top-level
+// tokens are processed from last to first.  If parsing yields no tree, or a
+// tree with no children (for example an empty buffer), `d` is left
+// unchanged.
+void critic_markup_accept(DString * d) {
+	token * t = critic_parse_substring(d->str, 0, d->currentStringLength);
+
+	if (t == NULL) {
+		// Nothing was tokenized, so there is nothing to accept or free
+		return;
+	}
+
+	if (t->child) {
+		accept_token_tree(d, t->child->tail);
+	}
+
+	token_free(t);
+}
+
+
+void reject_token_tree(DString * d, token * t);
+void reject_token(DString * d, token * t);
+
+
+// Reject a substitution.  `t` is the last token of the substitution; the
+// list is walked backwards through `prev` so that erasing text never shifts
+// the offsets of tokens that are still to be processed.
+//
+// Tokens after the CM_SUB_DIV divider (the new text and closing marker) are
+// erased from `d`.  The divider and every token before it (the old text and
+// the opening marker) are handed to reject_token().
+void reject_token_tree_sub(DString * d, token * t) {
+	token * cur = t;
+
+	// New version: erase outright until the divider is reached
+	for (; cur != NULL; cur = cur->prev) {
+		if (cur->type == CM_SUB_DIV) {
+			break;
+		}
+
+		d_string_erase(d, cur->start, cur->len);
+	}
+
+	// Divider and old version: process normally
+	for (; cur != NULL; cur = cur->prev) {
+		reject_token(d, cur);
+	}
+}
+
+
+// Apply the "reject changes" rule for a single token to the text in `d`.
+//
+//  * Substitution/deletion delimiters are erased only when they were
+//    matched (t->mate is set); unmatched delimiters are left in the text.
+//  * A substitution divider and addition, comment, and highlight pairs are
+//    erased using the token's own start/len.
+//  * Substitution and deletion pairs are processed through their children,
+//    starting from the mate of the first child and walking backwards.
+//  * Any other token type (e.g. CM_PLAIN_TEXT) is left untouched.
+void reject_token(DString * d, token * t) {
+	switch (t->type) {
+		case CM_SUB_CLOSE:
+			if (t->mate) {
+				d_string_erase(d, t->start, t->len);
+			}
+
+			break;
+
+		case CM_SUB_OPEN:
+		case CM_DEL_OPEN:
+		case CM_DEL_CLOSE:
+			if (!t->mate) {
+				// Unmatched delimiter -- keep it as literal text
+				break;
+			}
+
+			/* fallthrough */
+
+		case CM_SUB_DIV:
+		case CM_ADD_PAIR:
+		case CM_COM_PAIR:
+		case CM_HI_PAIR:
+			// Erase these
+			d_string_erase(d, t->start, t->len);
+			break;
+
+		case CM_SUB_PAIR:
+			// Erase new version and markers
+			reject_token_tree_sub(d, t->child->mate);
+			break;
+
+		case CM_DEL_PAIR:
+			// Check children
+			reject_token_tree(d, t->child->mate);
+			break;
+
+		default:
+			// Everything else is kept as-is
+			break;
+	}
+}
+
+
+// Run reject_token() on `t` and then on each earlier sibling in turn.
+// The chain is traversed last-to-first (via `prev`) so that erasing text
+// does not invalidate the offsets of tokens not yet visited.
+void reject_token_tree(DString * d, token * t) {
+	for (token * cur = t; cur != NULL; cur = cur->prev) {
+		reject_token(d, cur);
+	}
+}
+
+// Reject all CriticMarkup changes in `d`, editing the string in place.
+//
+// The whole buffer is parsed into a token tree, and the tree's top-level
+// tokens are processed from last to first.  If parsing yields no tree, or a
+// tree with no children (for example an empty buffer), `d` is left
+// unchanged.
+void critic_markup_reject(DString * d) {
+	token * t = critic_parse_substring(d->str, 0, d->currentStringLength);
+
+	if (t == NULL) {
+		// Nothing was tokenized, so there is nothing to reject or free
+		return;
+	}
+
+	if (t->child) {
+		reject_token_tree(d, t->child->tail);
+	}
+
+	token_free(t);
+}
+
--- a/Sources/libMultiMarkdown/critic_markup.h
+++ b/Sources/libMultiMarkdown/critic_markup.h
@ -0,0 +1,94 @@
+/**
+
+	MultiMarkdown -- Lightweight markup processor to produce HTML, LaTeX, and more.
+
+	@file critic_markup.h
+
+	@brief Token types and public interface for the CriticMarkup preprocessor.
+
+
+	@author	Fletcher T. Penney
+	@bug	
+
+**/
+
+/*
+
+	Copyright © 2016 - 2017 Fletcher T. Penney.
+
+
+	The `MultiMarkdown 6` project is released under the MIT License.
+	
+	GLibFacade.c and GLibFacade.h are from the MultiMarkdown v4 project:
+	
+		https://github.com/fletcher/MultiMarkdown-4/
+	
+	MMD 4 is released under both the MIT License and GPL.
+	
+	
+	CuTest is released under the zlib/libpng license. See CuTest.c for the
+	text of the license.
+	
+	
+	## The MIT License ##
+	
+	Permission is hereby granted, free of charge, to any person obtaining
+	a copy of this software and associated documentation files (the
+	"Software"), to deal in the Software without restriction, including
+	without limitation the rights to use, copy, modify, merge, publish,
+	distribute, sublicense, and/or sell copies of the Software, and to
+	permit persons to whom the Software is furnished to do so, subject to
+	the following conditions:
+	
+	The above copyright notice and this permission notice shall be
+	included in all copies or substantial portions of the Software.
+	
+	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+	EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+	MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+	IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+	CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+	TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+	SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+	
+
+*/
+
+
+#ifndef CRITIC_MARKUP_MULTIMARKDOWN_H
+#define CRITIC_MARKUP_MULTIMARKDOWN_H
+
+#include "d_string.h"
+
+// Token types used while tokenizing and pairing CriticMarkup.
+// The *_OPEN / *_CLOSE / CM_SUB_DIV values are assigned to the literal
+// delimiters, the *_PAIR values to matched open/close pairs, and
+// CM_PLAIN_TEXT to the text between delimiters.
+enum cm_types {
+	CM_ADD_OPEN = 1,		// Can't use type 0 -- addition opener "{++"
+	CM_ADD_CLOSE,			// Addition closer "++}"
+
+	CM_DEL_OPEN,			// Deletion opener "{--"
+	CM_DEL_CLOSE,			// Deletion closer "--}"
+
+	CM_SUB_OPEN,			// Substitution opener "{~~"
+	CM_SUB_DIV,				// Substitution divider "~>" between old and new text
+	CM_SUB_CLOSE,			// Substitution closer "~~}"
+
+	CM_HI_OPEN,				// Highlight opener "{=="
+	CM_HI_CLOSE,			// Highlight closer "==}"
+
+	CM_COM_OPEN,			// Comment opener "{>>"
+	CM_COM_CLOSE,			// Comment closer "<<}"
+
+	CM_ADD_PAIR,			// Matched addition (CM_ADD_OPEN .. CM_ADD_CLOSE)
+	CM_DEL_PAIR,			// Matched deletion (CM_DEL_OPEN .. CM_DEL_CLOSE)
+	CM_SUB_PAIR,			// Matched substitution (CM_SUB_OPEN .. CM_SUB_CLOSE)
+	CM_HI_PAIR, 			// Matched highlight (CM_HI_OPEN .. CM_HI_CLOSE)
+	CM_COM_PAIR,			// Matched comment (CM_COM_OPEN .. CM_COM_CLOSE)
+
+	CM_PLAIN_TEXT			// Text that is not a CriticMarkup delimiter
+};
+
+
+void critic_markup_accept(DString * d);
+
+void critic_markup_reject(DString * d);
+
+#endif
--- a/Sources/libMultiMarkdown/writer.c
+++ b/Sources/libMultiMarkdown/writer.c
@ -1486,7 +1486,7 @@ void process_metadata_stack(mmd_engine * e, scratch_pad * scratch) {


 void automatic_search_text(mmd_engine * e, token * t, trie * ac) {
-	match * m = ac_trie_leftmost_longest_search(ac, &e->dstr->str[t->start], t->len);
+	match * m = ac_trie_leftmost_longest_search(ac, e->dstr->str, t->start, t->len);

 	match * walker;

@ -1496,7 +1496,7 @@ void automatic_search_text(mmd_engine * e, token * t, trie * ac) {
 		walker = m->next;

 		while (walker) {
-			token_split(tok, walker->start + t->start, walker->len, walker->match_type);
+			token_split(tok, walker->start, walker->len, walker->match_type);

 			// Advance token to section after the split (if present)
 			tok = tok->next->next;
--- a/Sources/multimarkdown/main.c
+++ b/Sources/multimarkdown/main.c
@ -61,6 +61,7 @@


 #include "argtable3.h"
+#include "critic_markup.h"
 #include "d_string.h"
 #include "i18n.h"
 #include "libMultiMarkdown.h"
@ -73,7 +74,8 @@
 #define kBUFFERSIZE 4096	// How many bytes to read at a time

 // argtable structs
-struct arg_lit *a_help, *a_version, *a_compatibility, *a_nolabels, *a_batch, *a_accept, *a_reject, *a_full, *a_snippet;
+struct arg_lit *a_help, *a_version, *a_compatibility, *a_nolabels, *a_batch,
+		*a_accept, *a_reject, *a_full, *a_snippet;
 struct arg_str *a_format, *a_lang;
 struct arg_file *a_file, *a_o;
 struct arg_end *a_end;
@ -349,6 +351,15 @@ int main(int argc, char** argv) {
 				// Don't free folder -- owned by dirname
 			}

+			// Perform block level CriticMarkup?
+			if (extensions & EXT_CRITIC_ACCEPT) {
+				critic_markup_accept(buffer);
+			}
+
+			if (extensions & EXT_CRITIC_REJECT) {
+				critic_markup_reject(buffer);
+			}
+
 			// Increment counter and prepare token pool
 #ifdef kUseObjectPool
 			token_pool_init();
@ -412,6 +423,15 @@ int main(int argc, char** argv) {
 			// Don't free folder -- owned by dirname
 		}

+		// Perform block level CriticMarkup?
+		if (extensions & EXT_CRITIC_ACCEPT) {
+			critic_markup_accept(buffer);
+		}
+
+		if (extensions & EXT_CRITIC_REJECT) {
+			critic_markup_reject(buffer);
+		}
+
 		if (FORMAT_MMD == format) {
 			result = buffer->str;
 		} else {
--- a/tests/CriticMarkup/CriticMarkup.htmla
+++ b/tests/CriticMarkup/CriticMarkup.htmla
@ -0,0 +1,23 @@
+<!DOCTYPE html>
+<html>
+<head>
+	<meta charset="utf-8"/>
+	<title>Extended CriticMarkup</title>
+</head>
+<body>
+
+<p>This is a <em>single</em> paragraph</p>
+
+<p>that was split in two.</p>
+
+<p>This is <em>two</em> paragraphs joined together.</p>
+
+<p>This is two paragraphs</p>
+
+<p>With a <em>new</em> paragraph inserted</p>
+
+<p>between them.</p>
+
+</body>
+</html>
+
--- a/tests/CriticMarkup/CriticMarkup.htmlr
+++ b/tests/CriticMarkup/CriticMarkup.htmlr
@ -0,0 +1,21 @@
+<!DOCTYPE html>
+<html>
+<head>
+	<meta charset="utf-8"/>
+	<title>Extended CriticMarkup</title>
+</head>
+<body>
+
+<p>This is a <em>single</em> paragraph that was split in two.</p>
+
+<p>This is <em>two</em> paragraphs</p>
+
+<p>joined together.</p>
+
+<p>This is two paragraphs</p>
+
+<p>with nothing between them.</p>
+
+</body>
+</html>
+
--- a/tests/CriticMarkup/CriticMarkup.text
+++ b/tests/CriticMarkup/CriticMarkup.text
@ -0,0 +1,18 @@
+Title:	Extended CriticMarkup
+latex config:	article
+
+This is a *single* paragraph {++
+
++}that was split in two.
+
+This is *two* paragraphs {--
+
+--}joined together.
+
+This is two paragraphs{~~
+
+with nothing ~>
+
+With a *new* paragraph inserted
+
+~~}between them.