some more iterations on data extraction

2024-10-26 13:16:11 +03:00 · 2011-06-14 07:21:53 -07:00 · 2011-06-14 07:21:53 -07:00 · da500ce393
commit da500ce393
parent cd8f0bfddb
5 changed files with 240 additions and 20 deletions
--- a/src/data_parser.cc
+++ b/src/data_parser.cc
@ -14,19 +14,116 @@ static data_token_t UPTO_SEPARATOR[] = {
    DT_LINE,
 };

+/* Tokens of a key/value pair as they sit on the stack, front first: row, separator, key. */
+static data_token_t PATTERN_PAIR[] = { DNT_ROW, DT_SEPARATOR, DNT_KEY };
+
+/* Stack shape, front first, for extending an aggregate: any element, a comma, an existing aggregate. */
+static data_token_t PATTERN_AGGREGATE[] = { DT_ANY, DT_COMMA, DNT_AGGREGATE };
+
+/**
+ * Fold an "<aggregate> , <element>" sequence at the front of the stack into
+ * a single DNT_AGGREGATE element.  The new aggregate's sub-elements are the
+ * members of the old aggregate followed by the newly matched element (or by
+ * that element's own sub-elements, if it has any).
+ *
+ * Does nothing when the front of the stack does not match PATTERN_AGGREGATE.
+ */
+void data_parser::reduceAggregate(void)
+{
+    std::list<element> reduction;
+
+    if (this->reducePattern(reduction,
+			    PATTERN_AGGREGATE,
+			    PATTERN_AGGREGATE +
+			    sizeof(PATTERN_AGGREGATE) / sizeof(data_token_t))) {
+	// reducePattern() reverses what it pulled off the stack, so the
+	// reduction reads [old aggregate, comma, new element].
+	struct element &old_aggregate = reduction.front();
+	struct element &new_member = reduction.back();
+
+	this->dp_stack.push_front(element(reduction, DNT_AGGREGATE));
+
+	// Bind to the stack front only after the push, so the sub-elements
+	// land on the new aggregate and not on whatever was underneath the
+	// matched pattern (or on nothing at all if the stack was emptied).
+	struct element &top = this->dp_stack.front();
+
+	top.assign_elements(*old_aggregate.e_sub_elements);
+	if (new_member.e_sub_elements != NULL) {
+	    // Append; a second assign_elements() call would replace the
+	    // list that was just assigned above.
+	    top.e_sub_elements->splice(top.e_sub_elements->end(),
+				       *new_member.e_sub_elements);
+	}
+	else {
+	    top.e_sub_elements->push_back(new_member);
+	}
+    }
+}
+
+/**
+ * Try to reduce the front of the stack into a DNT_PAIR.
+ *
+ * Steps: fold any pending aggregate; wrap everything in front of the first
+ * UPTO_SEPARATOR token into a DNT_ROW; then, if the stack front matches
+ * PATTERN_PAIR, collapse it into a DNT_PAIR whose sub-elements are the key
+ * and the row (the separator is discarded).
+ */
+void data_parser::reducePair(void)
+{
+    std::list<element> reduction;
+
+    this->reduceAggregate();
+    // reduceUpTo() also reports success when the terminating token is already
+    // at the front of the stack, which leaves the reduction empty.  Building
+    // an element from an empty list would call front()/back() on it, so only
+    // create a row when something was actually collected.
+    if (this->reduceUpTo(reduction,
+			 UPTO_SEPARATOR,
+			 UPTO_SEPARATOR +
+			 sizeof(UPTO_SEPARATOR) / sizeof(data_token_t)) &&
+	!reduction.empty()) {
+	this->dp_stack.push_front(element(reduction, DNT_ROW));
+	this->dp_stack.front().assign_elements(reduction);
+    }
+    
+    if (this->reducePattern(reduction,
+			    PATTERN_PAIR,
+			    PATTERN_PAIR +
+			    sizeof(PATTERN_PAIR) / sizeof(data_token_t))) {
+	// The reduction reads [key, separator, row]; drop the separator so
+	// the pair keeps only the key and the row.
+	std::list<element>::iterator middle = reduction.begin();
+	
+	++middle;
+	reduction.erase(middle);
+	this->dp_stack.push_front(element(reduction, DNT_PAIR));
+	this->dp_stack.front().assign_elements(reduction);
+    }
+}
+
 void data_parser::reduce(const element &lookahead) // Feed one scanned token to the parser: run the reductions it triggers, then optionally push it onto dp_stack.
 {
-    struct element &top_elem = this->dp_stack.front();
+    std::list<element> reduction;
+    bool push_lookahead = true; // Cleared by the cases that consume the lookahead instead of stacking it.

    switch (lookahead.e_token) {
    case DT_INVALID:
+    case DT_WHITE:
+	push_lookahead = false; // Invalid and whitespace tokens are dropped.
 	break;

-    case DT_WHITE:
+    case DT_LINE:
+	this->reducePair(); // End of line: try to complete a pending pair.
+	push_lookahead = false;
+	break;
+
+    case DT_COMMA:
+	this->reduceAggregate(); // Extend an existing aggregate if the stack front matches.
+	if (!this->dp_stack.empty() &&
+	    this->dp_stack.front().e_token != DNT_AGGREGATE) {
+	    if (this->dp_stack.front().e_token == DT_SEPARATOR) {
+		push_lookahead = false; // Comma directly after a separator: the comma is dropped.
+	    }
+	    else { // Otherwise wrap the single front element in a new DNT_AGGREGATE.
+		std::list<element>::iterator next_elem = this->dp_stack.begin();
+
+		advance(next_elem, 1);
+		reduction.splice(reduction.end(),
+				 this->dp_stack,
+				 this->dp_stack.begin(),
+				 next_elem);
+		this->dp_stack.push_front(element(reduction, DNT_AGGREGATE));
+		this->dp_stack.front().assign_elements(reduction);
+	    }
+	}
 	break;
 	
    case DT_SEPARATOR:
-	
+	if (this->reducePattern(reduction, // Pull repeated PATTERN_KEY matches off the stack front (repeating = true).
+				PATTERN_KEY,
+				PATTERN_KEY +
+				sizeof(PATTERN_KEY) / sizeof(data_token_t),
+				true)) {
+	    this->reducePair();
+	    this->dp_stack.push_front(element(reduction, DNT_KEY)); // NOTE(review): only the capture span is recorded; reduction's contents are not attached as sub-elements.
+	}
 	break;
    }
+
+    if (push_lookahead) { // There is no default case: every token type not handled above is stacked as-is.
+	this->dp_stack.push_front(lookahead);
+    }
+    
+    // this->print();
+    printf("----\n"); // NOTE(review): debug separator written to stdout unconditionally on every call.
 }
--- a/src/data_parser.hh
+++ b/src/data_parser.hh
@ -12,15 +12,46 @@ class data_parser {

 public:
    struct element {
-	element() : e_token(DT_INVALID) { };
+	/** Construct an invalid-token element with no sub-elements. */
+	element() : e_token(DT_INVALID), e_sub_elements(NULL) { };
+
+	/**
+	 * Construct an element of type `token` whose capture runs from the
+	 * start of the first entry in `subs` to the end of the last one.
+	 * `subs` must not be empty.  Its contents are not taken over here;
+	 * call assign_elements() for that.
+	 */
+	element(std::list<element> &subs, data_token_t token)
+	    : e_capture(subs.front().e_capture.c_begin,
+			subs.back().e_capture.c_end),
+	      e_token(token),
+	      e_sub_elements(NULL) {
+	};
+	
+	/**
+	 * Copy the capture and token of `other`.  Only elements without
+	 * sub-elements may be copied (asserted).  The copy explicitly starts
+	 * with no sub-elements so the destructor never sees an indeterminate
+	 * pointer.
+	 *
+	 * NOTE(review): copy assignment is still compiler-generated and
+	 * would share e_sub_elements between both objects.
+	 */
+	element(const element &other)
+	    : e_capture(other.e_capture),
+	      e_token(other.e_token),
+	      e_sub_elements(NULL) {
+	    assert(other.e_sub_elements == NULL);
+	};
+
+	/** Release the owned sub-element list, if any. */
+	~element() {
+	    if (this->e_sub_elements != NULL) {
+		delete this->e_sub_elements;
+		this->e_sub_elements = NULL;
+	    }
+	};
+
+	/**
+	 * Take over the contents of `subs` as this element's sub-elements,
+	 * leaving `subs` empty.  Any list assigned earlier is freed.
+	 */
+	void assign_elements(std::list<element> &subs) {
+	    std::list<element> *fresh = new std::list<element>();
+
+	    // Splice before deleting so the call stays safe even if `subs`
+	    // is the list this element currently owns.
+	    fresh->splice(fresh->begin(), subs);
+	    delete this->e_sub_elements;
+	    this->e_sub_elements = fresh;
+	};
 	
 	pcre_context::capture_t e_capture;
 	data_token_t e_token;
+	
+	/* Owned list of child elements; NULL when there are none. */
+	std::list<element> *e_sub_elements;
    };

    struct element_cmp { // Predicate that matches a pattern token against a stack element.
-	bool operator()(data_token_t token, const element &b) const {
-	    return token == b.e_token;
+	bool operator()(data_token_t token, const element &elem) const {
+	    return token == elem.e_token || token == DT_ANY; // DT_ANY on the pattern side matches every element.
+	};
+	
+	bool operator()(const element &elem, data_token_t token) const { // Same test with the arguments swapped, for algorithms that pass the element first.
+	    return (*this)(token, elem);
 	};
    };

@ -43,7 +74,7 @@ public:
 	
 	while (this->dp_scanner->tokenize(pc, elem.e_token)) {
 	    elem.e_capture = *(pc.begin());
-	    
+
 	    this->reduce(elem);
 	}
    };
@ -55,24 +86,85 @@ public:
 		       const data_token_t *pattern_end,
 		       bool repeating = false) {
 	size_t pattern_size = (pattern_end - pattern_start);
+	bool found, retval = false;
+
+	reduction.clear();
+
+	do {
+	    found = false;
+	    if (pattern_size <= this->dp_stack.size() &&
+		std::equal(pattern_start, pattern_end,
+			   this->dp_stack.begin(),
+			   element_cmp())) {
+		std::list<element>::iterator match_end = this->dp_stack.begin();
+		
+		advance(match_end, pattern_size);
+		reduction.splice(reduction.end(),
+				 this->dp_stack,
+				 this->dp_stack.begin(),
+				 match_end);
+
+		retval = found = true;
+	    }
+	} while (found && repeating);
+
+	reduction.reverse();
+	
+	return retval;
+    };
+
+    bool reduceUpTo(std::list<element> &reduction, // Move the stack elements in front of the first token found in [possibilities_start, possibilities_end) into reduction.
+		    const data_token_t *possibilities_start,
+		    const data_token_t *possibilities_end) {
+	size_t poss_size = (possibilities_end - possibilities_start); // NOTE(review): computed but not used below.
+	std::list<element>::iterator iter;
 	bool retval = false;

 	reduction.clear();
-	if (pattern_size <= this->dp_stack.size() &&
-	    std::equal(pattern_start, pattern_end,
-		       this->dp_stack.begin(),
-		       element_cmp())) {
-	    std::list<element>::iterator match_end = this->dp_stack.begin();

-	    advance(match_end, pattern_size);
-	    reduction.splice(reduction.begin(),
+	iter = std::find_first_of(this->dp_stack.begin(), this->dp_stack.end(), // First element, searching from the stack front, whose token is one of the possibilities.
+				  possibilities_start, possibilities_end,
+				  element_cmp());
+	if (iter != this->dp_stack.end()) {
+	    reduction.splice(reduction.end(), // Take everything in front of the match; the matching element itself stays on the stack.
 			     this->dp_stack,
 			     this->dp_stack.begin(),
-			     match_end);
+			     iter);
+
+	    retval = true; // NOTE(review): also true when iter == begin(), i.e. with an empty reduction.
 	}

+	reduction.reverse(); // Elements were taken front-first; reverse them before returning.
+	
 	return retval;
    };
+
+    void reduceAggregate(void);
+    void reducePair(void);
+
+    /** Dump each stack element, followed by one level of its sub-elements, to stdout. */
+    void print(void) {
+	typedef std::list<data_parser::element>::iterator elem_iter_t;
+
+	for (elem_iter_t outer = this->dp_stack.begin();
+	     outer != this->dp_stack.end();
+	     ++outer) {
+	    data_parser::element &curr = *outer;
+
+	    printf("%d %d:%d %s\n",
+		   curr.e_token,
+		   curr.e_capture.c_begin,
+		   curr.e_capture.c_end,
+		   this->dp_scanner->get_input().get_substr(&curr.e_capture).c_str());
+
+	    // Leaf elements have nothing further to show.
+	    if (curr.e_sub_elements == NULL) {
+		continue;
+	    }
+
+	    for (elem_iter_t inner = curr.e_sub_elements->begin();
+		 inner != curr.e_sub_elements->end();
+		 ++inner) {
+		printf("  %d %d:%d %s\n",
+		       inner->e_token,
+		       inner->e_capture.c_begin,
+		       inner->e_capture.c_end,
+		       this->dp_scanner->get_input().get_substr(&inner->e_capture).c_str());
+	    }
+	}
+    };
    
    std::list<element> dp_stack;
    
--- a/src/data_scanner.hh
+++ b/src/data_scanner.hh
@ -40,8 +40,8 @@ enum data_token_t {
    DNT_MEASUREMENT,
    DNT_VARIABLE_KEY,
    DNT_ROWRANGE,
-	
-    T_ANY = 100,
+    
+    DT_ANY = 100,
 };

 class data_scanner {
@ -53,6 +53,8 @@ public:

    bool tokenize(pcre_context &pc, data_token_t &token_out);

+    pcre_input &get_input() { return this->ds_pcre_input; }; // Returns a mutable reference to this scanner's ds_pcre_input member.
+
 private:
    std::string ds_line;
    pcre_input ds_pcre_input;
--- a/src/pcrepp.hh
+++ b/src/pcrepp.hh
@ -38,7 +38,10 @@
 */
 class pcre_context {
 public:
-    typedef struct {
+    typedef struct capture {
+	capture() { };
+	capture(int begin, int end) : c_begin(begin), c_end(end) { };
+	
 	int c_begin;
 	int c_end;

--- a/test/drive_data_scanner.cc
+++ b/test/drive_data_scanner.cc
@ -7,12 +7,15 @@
 #include "data_scanner.hh"
 #include "data_parser.hh"

+using namespace std;
+
 int main(int argc, char *argv[])
 {
    pcre_context_static<30> pc;

-    data_scanner ds("a=1 b=2");
-    data_scanner ds2("a=1 b=2");
+    data_scanner ds("a=1 b=2\n");
+    //data_scanner ds2("a=1 b=2  c=3,4\n");
+    data_scanner ds2("c=3,4\n");
    data_token_t token;
    
    while (ds.tokenize(pc, token)) {
@ -24,4 +27,27 @@ int main(int argc, char *argv[])
    data_parser dp(&ds2);
    
    dp.parse();
+
+    printf("done\n");
+    for (list<data_parser::element>::iterator iter = dp.dp_stack.begin();
+	 iter != dp.dp_stack.end();
+	 ++iter) {
+	printf("%d %d:%d %s\n",
+	       iter->e_token,
+	       iter->e_capture.c_begin,
+	       iter->e_capture.c_end,
+	       ds2.get_input().get_substr(&iter->e_capture).c_str());
+	if (iter->e_sub_elements != NULL) {
+	    for (list<data_parser::element>::iterator iter2 =
+		     iter->e_sub_elements->begin();
+		 iter2 != iter->e_sub_elements->end();
+		 ++iter2) {
+		printf("  %d %d:%d %s\n",
+		       iter2->e_token,
+		       iter2->e_capture.c_begin,
+		       iter2->e_capture.c_end,
+		       ds2.get_input().get_substr(&iter2->e_capture).c_str());
+	    }
+	}
+    }
 }