Pull request 1924: 6003-relax-rule-validation

Updates #6003.

Squashed commit of the following:

commit 1874860877662999d158631e3a25f8072c24f155
Author: Ainar Garipov <A.Garipov@AdGuard.COM>
Date:   Thu Jul 13 19:36:26 2023 +0300

    filtering/rulelist: imp test

commit 871a41af8039bf4d4fb139622d4296bcaff6729c
Author: Ainar Garipov <A.Garipov@AdGuard.COM>
Date:   Thu Jul 13 19:10:35 2023 +0300

    filtering/rulelist: relax validation
This commit is contained in:
Ainar Garipov 2023-07-13 19:43:53 +03:00
parent f22d893845
commit 2adc8624c0
4 changed files with 46 additions and 29 deletions

View File

@ -25,8 +25,9 @@ NOTE: Add new changes BELOW THIS COMMENT.
### Fixed
- `bufio.Scanner: token too long` errors when trying to add filtering-rule lists
with lines over 1024 bytes long ([#6003]).
- `bufio.Scanner: token too long` and other errors when trying to add
filtering-rule lists with lines over 1024 bytes long or containing cosmetic
rules ([#6003]).
### Removed

View File

@ -6,10 +6,9 @@ import (
"fmt"
"hash/crc32"
"io"
"unicode"
"unicode/utf8"
"github.com/AdguardTeam/golibs/errors"
"golang.org/x/exp/slices"
)
// Parser is a filtering-rule parser that collects data, such as the checksum
@ -105,13 +104,11 @@ func (p *Parser) processLine(dst io.Writer, line []byte, lineNum int) (n int, er
badIdx, isRule = p.parseLineTitle(trimmed)
}
if badIdx != -1 {
badRune, _ := utf8.DecodeRune(trimmed[badIdx:])
return 0, fmt.Errorf(
"line %d: character %d: non-printable character %q",
"line %d: character %d: likely binary character %q",
lineNum,
badIdx+bytes.Index(line, trimmed)+1,
badRune,
trimmed[badIdx],
)
}
@ -144,41 +141,37 @@ func hasPrefixFold(b, prefix []byte) (ok bool) {
}
// parseLine returns true if the parsed line is a filtering rule. line is
// assumed to be trimmed of whitespace characters. nonPrintIdx is the index of
// the first non-printable character, if any; if there are none, nonPrintIdx is
// -1.
// assumed to be trimmed of whitespace characters. badIdx is the index of the
// first character that may indicate that this is a binary file, or -1 if none.
//
// A line is considered a rule if it's not empty, not a comment, and contains
// only printable characters.
func parseLine(line []byte) (nonPrintIdx int, isRule bool) {
func parseLine(line []byte) (badIdx int, isRule bool) {
if len(line) == 0 || line[0] == '#' || line[0] == '!' {
return -1, false
}
nonPrintIdx = bytes.IndexFunc(line, isNotPrintable)
badIdx = slices.IndexFunc(line, likelyBinary)
return nonPrintIdx, nonPrintIdx == -1
return badIdx, badIdx == -1
}
// isNotPrintable returns true if r is not a printable character that can be
// contained in a filtering rule.
func isNotPrintable(r rune) (ok bool) {
// Tab isn't included into Unicode's graphic symbols, so include it here
// explicitly.
return r != '\t' && !unicode.IsGraphic(r)
// likelyBinary returns true if b is likely to be a byte from a binary file.
func likelyBinary(b byte) (ok bool) {
return (b < ' ' || b == 0x7f) && b != '\n' && b != '\r' && b != '\t'
}
// parseLineTitle is like [parseLine] but additionally looks for a title. line
// is assumed to be trimmed of whitespace characters.
func (p *Parser) parseLineTitle(line []byte) (nonPrintIdx int, isRule bool) {
func (p *Parser) parseLineTitle(line []byte) (badIdx int, isRule bool) {
if len(line) == 0 || line[0] == '#' {
return -1, false
}
if line[0] != '!' {
nonPrintIdx = bytes.IndexFunc(line, isNotPrintable)
badIdx = slices.IndexFunc(line, likelyBinary)
return nonPrintIdx, nonPrintIdx == -1
return badIdx, badIdx == -1
}
const titlePattern = "! Title: "

View File

@ -77,6 +77,14 @@ func TestParser_Parse(t *testing.T) {
wantTitle: "Test Title",
wantRulesNum: 1,
wantWritten: len(testRuleTextBlocked),
}, {
name: "cosmetic_with_zwnj",
in: testRuleTextCosmetic,
wantDst: testRuleTextCosmetic,
wantErrMsg: "",
wantTitle: "",
wantRulesNum: 1,
wantWritten: len(testRuleTextCosmetic),
}, {
name: "bad_char",
in: "! Title: Test Title \n" +
@ -85,7 +93,7 @@ func TestParser_Parse(t *testing.T) {
wantDst: testRuleTextBlocked,
wantErrMsg: "line 3: " +
"character 4: " +
"non-printable character '\\x7f'",
"likely binary character '\\x7f'",
wantTitle: "Test Title",
wantRulesNum: 1,
wantWritten: len(testRuleTextBlocked),
@ -215,6 +223,14 @@ func BenchmarkParser_Parse(b *testing.B) {
require.NoError(b, errSink)
require.NotNil(b, resSink)
// Most recent result, on a ThinkPad X13 with a Ryzen Pro 7 CPU:
//
// goos: linux
// goarch: amd64
// pkg: github.com/AdguardTeam/AdGuardHome/internal/filtering/rulelist
// cpu: AMD Ryzen 7 PRO 4750U with Radeon Graphics
// BenchmarkParser_Parse-16 100000000 128.0 ns/op 48 B/op 1 allocs/op
}
func FuzzParser_Parse(f *testing.F) {
@ -226,15 +242,17 @@ func FuzzParser_Parse(f *testing.F) {
"! Comment",
"! Title ",
"! Title XXX",
testRuleTextBadTab,
testRuleTextBlocked,
testRuleTextCosmetic,
testRuleTextEtcHostsTab,
testRuleTextHTML,
testRuleTextBlocked,
testRuleTextBadTab,
"1.2.3.4",
"1.2.3.4 etc-hosts.example",
">>>\x00<<<",
">>>\x7F<<<",
strings.Repeat("a", n+1),
strings.Repeat("a", rulelist.DefaultRuleBufSize+1),
strings.Repeat("a", bufio.MaxScanTokenSize+1),
}
for _, tc := range testCases {

View File

@ -7,8 +7,13 @@ const testTimeout = 1 * time.Second
// Common texts for tests.
const (
testRuleTextHTML = "<!DOCTYPE html>\n"
testRuleTextBlocked = "||blocked.example^\n"
testRuleTextBadTab = "||bad-tab-and-comment.example^\t# A comment.\n"
testRuleTextBlocked = "||blocked.example^\n"
testRuleTextEtcHostsTab = "0.0.0.0 tab..example^\t# A comment.\n"
testRuleTextHTML = "<!DOCTYPE html>\n"
// testRuleTextCosmetic is a cosmetic rule with a zero-width non-joiner.
//
// See https://github.com/AdguardTeam/AdGuardHome/issues/6003.
testRuleTextCosmetic = "||cosmetic.example## :has-text(/\u200c/i)\n"
)