From da500ce39335a91b8e925192b03ca1589ca35f0c Mon Sep 17 00:00:00 2001 From: Tim Stack Date: Tue, 14 Jun 2011 07:21:53 -0700 Subject: [PATCH] some more iterations on data extraction --- src/data_parser.cc | 103 +++++++++++++++++++++++++++++++- src/data_parser.hh | 116 +++++++++++++++++++++++++++++++++---- src/data_scanner.hh | 6 +- src/pcrepp.hh | 5 +- test/drive_data_scanner.cc | 30 +++++++++- 5 files changed, 240 insertions(+), 20 deletions(-) diff --git a/src/data_parser.cc b/src/data_parser.cc index 8f9ca280..9908c998 100644 --- a/src/data_parser.cc +++ b/src/data_parser.cc @@ -14,19 +14,116 @@ static data_token_t UPTO_SEPARATOR[] = { DT_LINE, }; +static data_token_t PATTERN_PAIR[] = { + DNT_ROW, + DT_SEPARATOR, + DNT_KEY, +}; + +static data_token_t PATTERN_AGGREGATE[] = { + DT_ANY, + DT_COMMA, + DNT_AGGREGATE, +}; + +void data_parser::reduceAggregate(void) +{ + std::list reduction; + + if (this->reducePattern(reduction, + PATTERN_AGGREGATE, + PATTERN_AGGREGATE + + sizeof(PATTERN_AGGREGATE) / sizeof(data_token_t))) { + struct element &top = this->dp_stack.front(); + + this->dp_stack.push_front(element(reduction, DNT_AGGREGATE)); + top.assign_elements(*reduction.front().e_sub_elements); + if (reduction.back().e_sub_elements != NULL) + top.assign_elements(*reduction.back().e_sub_elements); + else + top.e_sub_elements->push_back(reduction.back()); + } +} + +void data_parser::reducePair(void) +{ + std::list reduction; + + this->reduceAggregate(); + if (this->reduceUpTo(reduction, + UPTO_SEPARATOR, + UPTO_SEPARATOR + + sizeof(UPTO_SEPARATOR) / sizeof(data_token_t))) { + this->dp_stack.push_front(element(reduction, DNT_ROW)); + this->dp_stack.front().assign_elements(reduction); + } + + if (this->reducePattern(reduction, + PATTERN_PAIR, + PATTERN_PAIR + + sizeof(PATTERN_PAIR) / sizeof(data_token_t))) { + std::list::iterator middle = reduction.begin(); + + ++middle; + reduction.erase(middle); + this->dp_stack.push_front(element(reduction, DNT_PAIR)); + this->dp_stack.front().assign_elements(reduction); + } +} + void data_parser::reduce(const element &lookahead) { - struct element &top_elem = this->dp_stack.front(); + std::list reduction; + bool push_lookahead = true; switch (lookahead.e_token) { case DT_INVALID: + case DT_WHITE: + push_lookahead = false; break; - case DT_WHITE: + case DT_LINE: + this->reducePair(); + push_lookahead = false; + break; + + case DT_COMMA: + this->reduceAggregate(); + if (!this->dp_stack.empty() && + this->dp_stack.front().e_token != DNT_AGGREGATE) { + if (this->dp_stack.front().e_token == DT_SEPARATOR) { + push_lookahead = false; + } + else { + std::list::iterator next_elem = this->dp_stack.begin(); + + advance(next_elem, 1); + reduction.splice(reduction.end(), + this->dp_stack, + this->dp_stack.begin(), + next_elem); + this->dp_stack.push_front(element(reduction, DNT_AGGREGATE)); + this->dp_stack.front().assign_elements(reduction); + } + } break; case DT_SEPARATOR: - + if (this->reducePattern(reduction, + PATTERN_KEY, + PATTERN_KEY + + sizeof(PATTERN_KEY) / sizeof(data_token_t), + true)) { + this->reducePair(); + this->dp_stack.push_front(element(reduction, DNT_KEY)); + } break; } + + if (push_lookahead) { + this->dp_stack.push_front(lookahead); + } + + // this->print(); + printf("----\n"); } diff --git a/src/data_parser.hh b/src/data_parser.hh index dbaefc02..09ce1dd1 100644 --- a/src/data_parser.hh +++ b/src/data_parser.hh @@ -12,15 +12,46 @@ class data_parser { public: struct element { - element() : e_token(DT_INVALID) { }; + element() : e_token(DT_INVALID), e_sub_elements(NULL) { }; + element(std::list &subs, data_token_t token) + : e_capture(subs.front().e_capture.c_begin, + subs.back().e_capture.c_end), + e_token(token), + e_sub_elements(NULL) { + }; + + element(const element &other) { + assert(other.e_sub_elements == NULL); + + this->e_capture = other.e_capture; + this->e_token = other.e_token; + }; + + ~element() { + if (this->e_sub_elements != NULL) { + delete this->e_sub_elements; + this->e_sub_elements = NULL; + } + }; + + void assign_elements(std::list &subs) { + this->e_sub_elements = new std::list(); + this->e_sub_elements->splice(this->e_sub_elements->begin(), subs); + }; pcre_context::capture_t e_capture; data_token_t e_token; + + std::list *e_sub_elements; }; struct element_cmp { - bool operator()(data_token_t token, const element &b) const { - return token == b.e_token; + bool operator()(data_token_t token, const element &elem) const { + return token == elem.e_token || token == DT_ANY; + }; + + bool operator()(const element &elem, data_token_t token) const { + return (*this)(token, elem); }; }; @@ -43,7 +74,7 @@ public: while (this->dp_scanner->tokenize(pc, elem.e_token)) { elem.e_capture = *(pc.begin()); - + this->reduce(elem); } }; @@ -55,24 +86,85 @@ public: const data_token_t *pattern_end, bool repeating = false) { size_t pattern_size = (pattern_end - pattern_start); + bool found, retval = false; + + reduction.clear(); + + do { + found = false; + if (pattern_size <= this->dp_stack.size() && + std::equal(pattern_start, pattern_end, + this->dp_stack.begin(), + element_cmp())) { + std::list::iterator match_end = this->dp_stack.begin(); + + advance(match_end, pattern_size); + reduction.splice(reduction.end(), + this->dp_stack, + this->dp_stack.begin(), + match_end); + + retval = found = true; + } + } while (found && repeating); + + reduction.reverse(); + + return retval; + }; + + bool reduceUpTo(std::list &reduction, + const data_token_t *possibilities_start, + const data_token_t *possibilities_end) { + size_t poss_size = (possibilities_end - possibilities_start); + std::list::iterator iter; bool retval = false; reduction.clear(); - if (pattern_size <= this->dp_stack.size() && - std::equal(pattern_start, pattern_end, - this->dp_stack.begin(), - element_cmp())) { - std::list::iterator match_end = this->dp_stack.begin(); - advance(match_end, pattern_size); - reduction.splice(reduction.begin(), + iter = std::find_first_of(this->dp_stack.begin(), this->dp_stack.end(), + possibilities_start, possibilities_end, + element_cmp()); + if (iter != this->dp_stack.end()) { + reduction.splice(reduction.end(), this->dp_stack, this->dp_stack.begin(), - match_end); + iter); + + retval = true; } + reduction.reverse(); + return retval; }; + + void reduceAggregate(void); + void reducePair(void); + + void print(void) { + for (std::list::iterator iter = this->dp_stack.begin(); + iter != this->dp_stack.end(); + ++iter) { + printf("%d %d:%d %s\n", + iter->e_token, + iter->e_capture.c_begin, + iter->e_capture.c_end, + this->dp_scanner->get_input().get_substr(&iter->e_capture).c_str()); + if (iter->e_sub_elements != NULL) { + for (std::list::iterator iter2 = + iter->e_sub_elements->begin(); + iter2 != iter->e_sub_elements->end(); + ++iter2) { + printf(" %d %d:%d %s\n", + iter2->e_token, + iter2->e_capture.c_begin, + iter2->e_capture.c_end, + this->dp_scanner->get_input().get_substr(&iter2->e_capture).c_str()); + } + } + } + }; std::list dp_stack; diff --git a/src/data_scanner.hh b/src/data_scanner.hh index e8cc9a30..27d4da69 100644 --- a/src/data_scanner.hh +++ b/src/data_scanner.hh @@ -40,8 +40,8 @@ enum data_token_t { DNT_MEASUREMENT, DNT_VARIABLE_KEY, DNT_ROWRANGE, - - T_ANY = 100, + + DT_ANY = 100, }; class data_scanner { @@ -53,6 +53,8 @@ public: bool tokenize(pcre_context &pc, data_token_t &token_out); + pcre_input &get_input() { return this->ds_pcre_input; }; + private: std::string ds_line; pcre_input ds_pcre_input; diff --git a/src/pcrepp.hh b/src/pcrepp.hh index f0ad60b3..25ab09b6 100644 --- a/src/pcrepp.hh +++ b/src/pcrepp.hh @@ -38,7 +38,10 @@ */ class pcre_context { public: - typedef struct { + typedef struct capture { + capture() { }; + capture(int begin, int end) : c_begin(begin), c_end(end) { }; + int c_begin; int c_end; diff --git a/test/drive_data_scanner.cc b/test/drive_data_scanner.cc index 0bd3ae80..1dfeea85 100644 --- a/test/drive_data_scanner.cc +++ b/test/drive_data_scanner.cc @@ -7,12 +7,15 @@ #include "data_scanner.hh" #include "data_parser.hh" +using namespace std; + int main(int argc, char *argv[]) { pcre_context_static<30> pc; - data_scanner ds("a=1 b=2"); - data_scanner ds2("a=1 b=2"); + data_scanner ds("a=1 b=2\n"); + //data_scanner ds2("a=1 b=2 c=3,4\n"); + data_scanner ds2("c=3,4\n"); data_token_t token; while (ds.tokenize(pc, token)) { @@ -24,4 +27,27 @@ int main(int argc, char *argv[]) data_parser dp(&ds2); dp.parse(); + + printf("done\n"); + for (list::iterator iter = dp.dp_stack.begin(); + iter != dp.dp_stack.end(); + ++iter) { + printf("%d %d:%d %s\n", + iter->e_token, + iter->e_capture.c_begin, + iter->e_capture.c_end, + ds2.get_input().get_substr(&iter->e_capture).c_str()); + if (iter->e_sub_elements != NULL) { + for (list::iterator iter2 = + iter->e_sub_elements->begin(); + iter2 != iter->e_sub_elements->end(); + ++iter2) { + printf(" %d %d:%d %s\n", + iter2->e_token, + iter2->e_capture.c_begin, + iter2->e_capture.c_end, + ds2.get_input().get_substr(&iter2->e_capture).c_str()); + } + } + } }