1
1
mirror of https://github.com/tstack/lnav.git synced 2024-10-26 13:16:11 +03:00

[log_format] try to do a better job matching JSON formats to logs

Try all formats and find the one with the highest quality
match.  For JSON logs, this means the format that matched
the most fields in the line-format.

Related to #1136
This commit is contained in:
Tim Stack 2023-06-20 09:38:11 -07:00
parent a1cfe34a7b
commit 1c8c7089cb
7 changed files with 209 additions and 168 deletions

View File

@ -432,6 +432,7 @@ struct json_log_userdata {
this->jlu_valid_utf = false;
}
this->jlu_sub_line_count += res.vlcr_count;
this->jlu_quality += res.vlcr_line_format_count;
}
external_log_format* jlu_format{nullptr};
@ -444,6 +445,7 @@ struct json_log_userdata {
const char* jlu_line_value{nullptr};
size_t jlu_line_size{0};
size_t jlu_sub_start{0};
uint32_t jlu_quality{0};
shared_buffer_ref& jlu_shared_buffer;
scan_batch_context* jlu_batch_context;
};
@ -719,13 +721,13 @@ external_log_format::scan(logfile& lf,
if (!line_frag.startswith("{")) {
if (!this->lf_specialized) {
return log_format::SCAN_NO_MATCH;
return log_format::scan_no_match{"line is not a JSON object"};
}
ll.set_time(dst.back().get_timeval());
ll.set_level(LEVEL_INVALID);
dst.emplace_back(ll);
return log_format::SCAN_MATCH;
return log_format::scan_match{0};
}
auto& ypc = *(this->jlf_parse_context);
@ -733,7 +735,7 @@ external_log_format::scan(logfile& lf,
json_log_userdata jlu(sbr, &sbc);
if (!this->lf_specialized && dst.size() >= 3) {
return log_format::SCAN_NO_MATCH;
return log_format::scan_no_match{"file is not JSON-lines"};
}
if (li.li_partial) {
@ -743,7 +745,7 @@ external_log_format::scan(logfile& lf,
ll.set_level(LEVEL_INVALID);
dst.emplace_back(ll);
}
return log_format::SCAN_INCOMPLETE;
return log_format::scan_incomplete{};
}
const auto* line_data = (const unsigned char*) sbr.get_data();
@ -768,11 +770,11 @@ external_log_format::scan(logfile& lf,
if (this->lf_specialized) {
ll.set_ignore(true);
dst.emplace_back(ll);
return log_format::SCAN_MATCH;
return log_format::scan_match{0};
}
log_debug("no match! %.*s", sbr.length(), line_data);
return log_format::SCAN_NO_MATCH;
return log_format::scan_no_match{
"JSON message does not have expected timestamp property"};
}
jlu.jlu_sub_line_count += this->jlf_line_format_init_count;
@ -801,7 +803,7 @@ external_log_format::scan(logfile& lf,
yajl_free_error(handle, msg);
}
if (!this->lf_specialized) {
return log_format::SCAN_NO_MATCH;
return log_format::scan_no_match{"JSON parsing failed"};
}
for (int lpc = 0; lpc < line_count; lpc++) {
log_level_t level = LEVEL_INVALID;
@ -816,7 +818,7 @@ external_log_format::scan(logfile& lf,
}
}
return log_format::SCAN_MATCH;
return log_format::scan_match{jlu.jlu_quality};
}
int curr_fmt = -1, orig_lock = this->last_pattern_index();
@ -1027,7 +1029,7 @@ external_log_format::scan(logfile& lf,
}
this->lf_pattern_locks.emplace_back(lock_line, curr_fmt);
}
return log_format::SCAN_MATCH;
return log_format::scan_match{0};
}
if (this->lf_specialized && !this->lf_multiline) {
@ -1038,10 +1040,10 @@ external_log_format::scan(logfile& lf,
last_line.get_timeval(),
log_level_t::LEVEL_INVALID);
return log_format::SCAN_MATCH;
return log_format::scan_match{0};
}
return log_format::SCAN_NO_MATCH;
return log_format::scan_no_match{"no patterns matched"};
}
uint8_t
@ -2974,19 +2976,20 @@ external_log_format::value_line_count(const intern_string_t ist,
lvs.add_value(val.value());
}
}
if (iter->second->vd_meta.is_hidden()) {
retval.vlcr_count = 0;
return retval;
}
if (std::find_if(this->jlf_line_format.begin(),
this->jlf_line_format.end(),
json_field_cmp(json_log_field::VARIABLE, ist))
!= this->jlf_line_format.end())
{
retval.vlcr_line_format_count += 1;
retval.vlcr_count -= 1;
}
if (iter->second->vd_meta.is_hidden()) {
retval.vlcr_count = 0;
}
return retval;
}

View File

@ -354,12 +354,19 @@ public:
return false;
}
enum scan_result_t {
SCAN_MATCH,
SCAN_NO_MATCH,
SCAN_INCOMPLETE,
struct scan_match {
uint32_t sm_quality;
};
struct scan_no_match {
const char* snm_reason{nullptr};
};
struct scan_incomplete {};
using scan_result_t
= mapbox::util::variant<scan_match, scan_no_match, scan_incomplete>;
/**
* Scan a log line to see if it matches this log format.
*

View File

@ -284,6 +284,7 @@ public:
struct value_line_count_result {
size_t vlcr_count{1};
size_t vlcr_line_format_count{0};
bool vlcr_has_ansi{false};
bool vlcr_valid_utf{true};
};

View File

@ -127,10 +127,10 @@ class generic_log_format : public log_format {
}
dst.emplace_back(li.li_file_range.fr_offset, log_tv, level_val);
return SCAN_MATCH;
return scan_match{0};
}
return SCAN_NO_MATCH;
return scan_no_match{"no patterns matched"};
}
void annotate(uint64_t line_number,
@ -391,7 +391,7 @@ public:
for (auto iter = ss.begin(); iter != ss.end(); ++iter) {
if (iter.index() == 0 && *iter == "#close") {
return SCAN_MATCH;
return scan_match{0};
}
if (iter.index() >= this->blf_field_defs.size()) {
@ -449,9 +449,9 @@ public:
}
}
dst.emplace_back(li.li_file_range.fr_offset, tv, level, 0, opid);
return SCAN_MATCH;
return scan_match{0};
}
return SCAN_NO_MATCH;
return scan_no_match{};
}
scan_result_t scan(logfile& lf,
@ -470,14 +470,14 @@ public:
if (dst.empty() || dst.size() > 20 || sbr.empty()
|| sbr.get_data()[0] == '#')
{
return SCAN_NO_MATCH;
return scan_no_match{};
}
auto line_iter = dst.begin();
auto read_result = lf.read_line(line_iter);
if (read_result.isErr()) {
return SCAN_NO_MATCH;
return scan_no_match{"unable to read first line"};
}
auto line = read_result.unwrap();
@ -488,7 +488,7 @@ public:
.matches(PCRE2_NO_UTF_CHECK)
.ignore_error();
if (!match_res) {
return SCAN_NO_MATCH;
return scan_no_match{"cannot read separator header"};
}
this->clear();
@ -500,7 +500,7 @@ public:
auto next_read_result = lf.read_line(line_iter);
if (next_read_result.isErr()) {
return SCAN_NO_MATCH;
return scan_no_match{"unable to read header line"};
}
line = next_read_result.unwrap();
@ -608,7 +608,7 @@ public:
this->blf_format_name.clear();
this->lf_value_stats.clear();
return SCAN_NO_MATCH;
return scan_no_match{};
}
void annotate(uint64_t line_number,
@ -1019,7 +1019,7 @@ public:
}
dst.emplace_back(
li.li_file_range.fr_offset, 0, 0, LEVEL_IGNORE, 0);
return SCAN_MATCH;
return scan_match{0};
}
sf = sf.trim("\" \t");
@ -1088,10 +1088,10 @@ public:
}
}
dst.emplace_back(li.li_file_range.fr_offset, tv, level, 0);
return SCAN_MATCH;
return scan_match{0};
}
return SCAN_NO_MATCH;
return scan_no_match{};
}
scan_result_t scan(logfile& lf,
@ -1105,7 +1105,7 @@ public:
static auto X_FIELDS_IDX = 0;
if (li.li_partial) {
return SCAN_INCOMPLETE;
return scan_incomplete{};
}
if (!this->wlf_format_name.empty()) {
@ -1115,7 +1115,7 @@ public:
if (dst.empty() || dst.size() > 20 || sbr.empty()
|| sbr.get_data()[0] == '#')
{
return SCAN_NO_MATCH;
return scan_no_match{};
}
this->clear();
@ -1125,7 +1125,7 @@ public:
auto next_read_result = lf.read_line(line_iter);
if (next_read_result.isErr()) {
return SCAN_NO_MATCH;
return scan_no_match{"unable to read first line"};
}
auto line = next_read_result.unwrap();
@ -1233,7 +1233,7 @@ public:
this->wlf_format_name.clear();
this->lf_value_stats.clear();
return SCAN_NO_MATCH;
return scan_no_match{};
}
void annotate(uint64_t line_number,
@ -1585,7 +1585,7 @@ public:
scan_batch_context& sbc) override
{
auto p = logfmt::parser(sbr.to_string_fragment());
scan_result_t retval = scan_result_t::SCAN_NO_MATCH;
scan_result_t retval = scan_no_match{};
bool done = false;
logfmt_pair_handler lph(this->lf_date_time);
@ -1655,7 +1655,7 @@ public:
if (lph.lph_found_time) {
dst.emplace_back(
li.li_file_range.fr_offset, lph.lph_tv, lph.lph_level);
retval = scan_result_t::SCAN_MATCH;
retval = scan_match{0};
}
return retval;

View File

@ -212,7 +212,7 @@ logfile::process_prefix(shared_buffer_ref& sbr,
= injector::get<const lnav::logfile::config&>()
.lc_max_unrecognized_lines;
log_format::scan_result_t found = log_format::SCAN_NO_MATCH;
log_format::scan_result_t found = log_format::scan_no_match{};
size_t prescan_size = this->lf_index.size();
time_t prescan_time = 0;
bool retval = false;
@ -225,177 +225,204 @@ logfile::process_prefix(shared_buffer_ref& sbr,
found = this->lf_format->scan(*this, this->lf_index, li, sbr, sbc);
} else if (this->lf_options.loo_detect_format) {
const auto& root_formats = log_format::get_root_formats();
nonstd::optional<std::pair<log_format*, log_format::scan_match>>
best_match;
/*
* Try each scanner until we get a match. Fortunately, the formats
* tend to be sufficiently different that there are few ambiguities...
*/
for (auto iter = root_formats.begin();
iter != root_formats.end() && (found != log_format::SCAN_MATCH);
++iter)
{
log_info("logfile[%s]: scanning line %d (offset: %d; size: %d)",
this->lf_filename.c_str(),
this->lf_index.size(),
li.li_file_range.fr_offset,
li.li_file_range.fr_size);
for (const auto& curr : root_formats) {
if (this->lf_index.size()
>= (*iter)->lf_max_unrecognized_lines.value_or(
>= curr->lf_max_unrecognized_lines.value_or(
max_unrecognized_lines))
{
continue;
}
if (this->lf_mismatched_formats.count((*iter)->get_name()) > 0) {
if (this->lf_mismatched_formats.count(curr->get_name()) > 0) {
continue;
}
if (!(*iter)->match_name(this->lf_filename)) {
if (!curr->match_name(this->lf_filename)) {
if (li.li_file_range.fr_offset == 0) {
log_debug("(%s) does not match file name: %s",
(*iter)->get_name().get(),
curr->get_name().get(),
this->lf_filename.c_str());
}
this->lf_mismatched_formats.insert((*iter)->get_name());
this->lf_mismatched_formats.insert(curr->get_name());
continue;
}
if (!(*iter)->match_mime_type(this->lf_options.loo_file_format)) {
if (!curr->match_mime_type(this->lf_options.loo_file_format)) {
if (li.li_file_range.fr_offset == 0) {
log_debug("(%s) does not match file format: %s",
(*iter)->get_name().get(),
curr->get_name().get(),
fmt::to_string(this->lf_options.loo_file_format)
.c_str());
}
continue;
}
(*iter)->clear();
this->set_format_base_time(iter->get());
found = (*iter)->scan(*this, this->lf_index, li, sbr, sbc);
if (found == log_format::SCAN_MATCH) {
#if 0
require(this->lf_index.size() == 1 ||
(this->lf_index[this->lf_index.size() - 2] <
this->lf_index[this->lf_index.size() - 1]));
#endif
log_info("%s:%d:log format found -- %s",
this->lf_filename.c_str(),
this->lf_index.size(),
(*iter)->get_name().get());
curr->clear();
this->set_format_base_time(curr.get());
std::vector<logline> scan_backup = this->lf_index;
auto scan_res = curr->scan(*this, this->lf_index, li, sbr, sbc);
this->lf_text_format = text_format_t::TF_LOG;
this->lf_format = (*iter)->specialized();
this->set_format_base_time(this->lf_format.get());
this->lf_content_id
= hasher().update(sbr.get_data(), sbr.length()).to_string();
for (auto& td_pair : this->lf_format->lf_tag_defs) {
bool matches = td_pair.second->ftd_paths.empty();
for (const auto& pr : td_pair.second->ftd_paths) {
if (pr.matches(this->lf_filename.c_str())) {
matches = true;
break;
}
}
if (!matches) {
continue;
}
log_info("%s: found applicable tag definition /%s/tags/%s",
this->lf_filename.c_str(),
this->lf_format->get_name().get(),
td_pair.second->ftd_name.c_str());
this->lf_applicable_taggers.emplace_back(td_pair.second);
}
/*
* We'll go ahead and assume that any previous lines were
* written out at the same time as the last one, so we need to
* go back and update everything.
*/
auto& last_line = this->lf_index[this->lf_index.size() - 1];
for (size_t lpc = 0; lpc < this->lf_index.size() - 1; lpc++) {
if (this->lf_format->lf_multiline) {
this->lf_index[lpc].set_time(last_line.get_time());
this->lf_index[lpc].set_millis(last_line.get_millis());
scan_res.match(
[this, &curr, &best_match, &scan_backup](
const log_format::scan_match& sm) {
if (!best_match
|| sm.sm_quality > best_match->second.sm_quality)
{
log_info(
" scan with format (%s) matched with quality (%d)",
curr->get_name().c_str(),
sm.sm_quality);
best_match = std::make_pair(curr.get(), sm);
} else {
this->lf_index[lpc].set_ignore(true);
log_info(
" scan with format (%s) matched, but "
"is low quality (%d)",
curr->get_name().c_str(),
sm.sm_quality);
this->lf_index = std::move(scan_backup);
}
},
[curr](const log_format::scan_incomplete& si) {
log_info(
" scan with format (%s) is incomplete, "
"more data required",
curr->get_name().c_str());
},
[curr](const log_format::scan_no_match& snm) {
log_info(" scan with format (%s) does not match -- %s",
curr->get_name().c_str(),
snm.snm_reason);
});
}
if (best_match) {
auto winner = best_match.value();
auto* curr = winner.first;
log_info("%s:%d:log format found -- %s",
this->lf_filename.c_str(),
this->lf_index.size(),
curr->get_name().get());
this->lf_text_format = text_format_t::TF_LOG;
this->lf_format = curr->specialized();
this->set_format_base_time(this->lf_format.get());
this->lf_content_id
= hasher().update(sbr.get_data(), sbr.length()).to_string();
for (auto& td_pair : this->lf_format->lf_tag_defs) {
bool matches = td_pair.second->ftd_paths.empty();
for (const auto& pr : td_pair.second->ftd_paths) {
if (pr.matches(this->lf_filename.c_str())) {
matches = true;
break;
}
}
break;
if (!matches) {
continue;
}
log_info("%s: found applicable tag definition /%s/tags/%s",
this->lf_filename.c_str(),
this->lf_format->get_name().get(),
td_pair.second->ftd_name.c_str());
this->lf_applicable_taggers.emplace_back(td_pair.second);
}
/*
* We'll go ahead and assume that any previous lines were
* written out at the same time as the last one, so we need to
* go back and update everything.
*/
auto& last_line = this->lf_index[this->lf_index.size() - 1];
for (size_t lpc = 0; lpc < this->lf_index.size() - 1; lpc++) {
if (this->lf_format->lf_multiline) {
this->lf_index[lpc].set_time(last_line.get_time());
this->lf_index[lpc].set_millis(last_line.get_millis());
} else {
this->lf_index[lpc].set_ignore(true);
}
}
found = best_match->second;
}
}
switch (found) {
case log_format::SCAN_MATCH: {
if (!this->lf_index.empty()) {
auto& last_line = this->lf_index.back();
if (found.is<log_format::scan_match>()) {
if (!this->lf_index.empty()) {
auto& last_line = this->lf_index.back();
last_line.set_valid_utf(last_line.is_valid_utf()
&& li.li_valid_utf);
last_line.set_has_ansi(last_line.has_ansi() || li.li_has_ansi);
}
if (prescan_size > 0 && this->lf_index.size() >= prescan_size
&& prescan_time != this->lf_index[prescan_size - 1].get_time())
{
retval = true;
}
if (prescan_size > 0 && prescan_size < this->lf_index.size()) {
auto& second_to_last = this->lf_index[prescan_size - 1];
auto& latest = this->lf_index[prescan_size];
last_line.set_valid_utf(last_line.is_valid_utf()
&& li.li_valid_utf);
last_line.set_has_ansi(last_line.has_ansi() || li.li_has_ansi);
}
if (prescan_size > 0 && this->lf_index.size() >= prescan_size
&& prescan_time != this->lf_index[prescan_size - 1].get_time())
{
retval = true;
}
if (prescan_size > 0 && prescan_size < this->lf_index.size()) {
auto& second_to_last = this->lf_index[prescan_size - 1];
auto& latest = this->lf_index[prescan_size];
if (!second_to_last.is_ignored() && latest < second_to_last) {
if (this->lf_format->lf_time_ordered) {
this->lf_out_of_time_order_count += 1;
for (size_t lpc = prescan_size;
lpc < this->lf_index.size();
lpc++)
{
auto& line_to_update = this->lf_index[lpc];
if (!second_to_last.is_ignored() && latest < second_to_last) {
if (this->lf_format->lf_time_ordered) {
this->lf_out_of_time_order_count += 1;
for (size_t lpc = prescan_size; lpc < this->lf_index.size();
lpc++)
{
auto& line_to_update = this->lf_index[lpc];
line_to_update.set_time_skew(true);
line_to_update.set_time(second_to_last.get_time());
line_to_update.set_millis(
second_to_last.get_millis());
}
} else {
retval = true;
line_to_update.set_time_skew(true);
line_to_update.set_time(second_to_last.get_time());
line_to_update.set_millis(second_to_last.get_millis());
}
} else {
retval = true;
}
}
break;
}
case log_format::SCAN_NO_MATCH: {
log_level_t last_level = LEVEL_UNKNOWN;
time_t last_time = this->lf_index_time;
short last_millis = 0;
uint8_t last_mod = 0, last_opid = 0;
} else if (found.is<log_format::scan_no_match>()) {
log_level_t last_level = LEVEL_UNKNOWN;
time_t last_time = this->lf_index_time;
short last_millis = 0;
uint8_t last_mod = 0, last_opid = 0;
if (!this->lf_index.empty()) {
logline& ll = this->lf_index.back();
if (!this->lf_index.empty()) {
logline& ll = this->lf_index.back();
/*
* Assume this line is part of the previous one(s) and copy the
* metadata over.
*/
last_time = ll.get_time();
last_millis = ll.get_millis();
if (this->lf_format.get() != nullptr) {
last_level = (log_level_t) (ll.get_level_and_flags()
| LEVEL_CONTINUED);
}
last_mod = ll.get_module_id();
last_opid = ll.get_opid();
/*
* Assume this line is part of the previous one(s) and copy the
* metadata over.
*/
last_time = ll.get_time();
last_millis = ll.get_millis();
if (this->lf_format.get() != nullptr) {
last_level = (log_level_t) (ll.get_level_and_flags()
| LEVEL_CONTINUED);
}
this->lf_index.emplace_back(li.li_file_range.fr_offset,
last_time,
last_millis,
last_level,
last_mod,
last_opid);
this->lf_index.back().set_valid_utf(li.li_valid_utf);
this->lf_index.back().set_has_ansi(li.li_has_ansi);
break;
last_mod = ll.get_module_id();
last_opid = ll.get_opid();
}
case log_format::SCAN_INCOMPLETE:
break;
this->lf_index.emplace_back(li.li_file_range.fr_offset,
last_time,
last_millis,
last_level,
last_mod,
last_opid);
this->lf_index.back().set_valid_utf(li.li_valid_utf);
this->lf_index.back().set_has_ansi(li.li_has_ansi);
}
return retval;

View File

@ -172,8 +172,9 @@ main(int argc, char* argv[])
line_info li = {{13}};
(*iter)->clear();
if ((*iter)->scan(*lf, index, li, sbr, sbc)
== log_format::SCAN_MATCH)
if ((*iter)
->scan(*lf, index, li, sbr, sbc)
.is<log_format::scan_match>())
{
format = (*iter)->specialized();
found = true;

View File

@ -1,6 +1,8 @@
⋮ abc 48 def info - -
2013-09-06T20:00:48.124 TRACE trace test
@fields: { "lvl": "TRACE", "msg": "trace test"}
⋮ abc 49 def info - -
2013-09-06T20:00:48.124 TRACE trace test
@fields: { "lvl": "TRACE", "msg": "trace test"}
2013-09-06T20:00:49.124 INFO Starting up service
@fields: { "lvl": "INFO", "msg": "Starting up service"}
[offset: 186] {"ts": "2013-09-06T22:00:49.124817Z", "@fields": { "lvl": "INFO", "msg":
parse error: premature EOF