AdGuardHome/internal/filtering/filter.go
Dimitry Kolyshev c54635e8a1 Pull request: 6020-rulelist-name
Updates #6020.

Squashed commit of the following:

commit fedb9415fb40d103261ca9b966c3d634692f899d
Author: Dimitry Kolyshev <dkolyshev@adguard.com>
Date:   Fri Aug 11 12:45:06 2023 +0300

    filtering: imp tests

commit d85d193ca7808e9089fa8ac3b26652f9c88c44ad
Merge: f1c1eddc1 94cf50a53
Author: Dimitry Kolyshev <dkolyshev@adguard.com>
Date:   Fri Aug 11 11:07:39 2023 +0300

    Merge remote-tracking branch 'origin/master' into 6020-rulelist-name

commit f1c1eddc113d2659adb666d7849ce0830eaf71f0
Author: Dimitry Kolyshev <dkolyshev@adguard.com>
Date:   Fri Aug 11 10:59:07 2023 +0300

    filtering: imp tests

commit 39e9d546dc2438409607ffebe414e9d656275504
Author: Dimitry Kolyshev <dkolyshev@adguard.com>
Date:   Fri Aug 11 10:02:48 2023 +0300

    filtering: imp code

commit 230f15ddad95c670e93c58db6d9928c3d0e0b79b
Merge: 1940fb397 111005b8d
Author: Dimitry Kolyshev <dkolyshev@adguard.com>
Date:   Thu Aug 10 15:03:08 2023 +0300

    Merge remote-tracking branch 'origin/master' into 6020-rulelist-name

commit 1940fb3973344a7d1ab8acfdc9401ed41fe0e666
Author: Dimitry Kolyshev <dkolyshev@adguard.com>
Date:   Thu Aug 10 15:01:57 2023 +0300

    all: docs

commit 810f6d17968873ce489b2e24f496d31179675e37
Author: Dimitry Kolyshev <dkolyshev@adguard.com>
Date:   Thu Aug 10 15:00:59 2023 +0300

    filtering: imp code

commit f310dd2281dc81cd816701696cf1bb289b4fb708
Author: Dimitry Kolyshev <dkolyshev@adguard.com>
Date:   Thu Aug 10 12:19:55 2023 +0300

    client: flt name

commit 9494771c57c464dbe5117315efdb3104b977bac4
Author: Dimitry Kolyshev <dkolyshev@adguard.com>
Date:   Thu Aug 10 12:18:57 2023 +0300

    filtering: flt name
2023-08-11 13:55:49 +03:00

669 lines
16 KiB
Go

package filtering
import (
"fmt"
"io"
"net/http"
"os"
"path/filepath"
"strconv"
"strings"
"time"
"github.com/AdguardTeam/AdGuardHome/internal/aghrenameio"
"github.com/AdguardTeam/AdGuardHome/internal/filtering/rulelist"
"github.com/AdguardTeam/golibs/errors"
"github.com/AdguardTeam/golibs/log"
"github.com/AdguardTeam/golibs/stringutil"
"golang.org/x/exp/slices"
)
// filterDir is the subdirectory of a data directory to store downloaded
// filters.
const filterDir = "filters"
// nextFilterID is a way to seed a unique ID generation.
//
// TODO(e.burkov): Use more deterministic approach.
var nextFilterID = time.Now().Unix()
// FilterYAML represents a filter list in the configuration file.
//
// TODO(e.burkov): Investigate if the field ordering is important.
type FilterYAML struct {
Enabled bool
URL string // URL or a file path
Name string `yaml:"name"`
RulesCount int `yaml:"-"`
LastUpdated time.Time `yaml:"-"`
checksum uint32 // checksum of the file data
white bool
Filter `yaml:",inline"`
}
// Clear filter rules
func (filter *FilterYAML) unload() {
filter.RulesCount = 0
filter.checksum = 0
}
// Path to the filter contents
func (filter *FilterYAML) Path(dataDir string) string {
return filepath.Join(dataDir, filterDir, strconv.FormatInt(filter.ID, 10)+".txt")
}
// ensureName sets provided title or default name for the filter if it doesn't
// have name already.
func (filter *FilterYAML) ensureName(title string) {
if filter.Name != "" {
return
}
if title != "" {
filter.Name = title
return
}
filter.Name = fmt.Sprintf("List %d", filter.ID)
}
const (
// errFilterNotExist is returned from [filterSetProperties] when there are
// no lists with the desired URL to update.
//
// TODO(e.burkov): Use wherever the same error is needed.
errFilterNotExist errors.Error = "url doesn't exist"
// errFilterExists is returned from [filterSetProperties] when there is
// another filter having the same URL as the one updated.
//
// TODO(e.burkov): Use wherever the same error is needed.
errFilterExists errors.Error = "url already exists"
)
// filterSetProperties searches for the particular filter list by url and sets
// the values of newList to it, updating afterwards if needed. It returns true
// if the update was performed and the filtering engine restart is required.
func (d *DNSFilter) filterSetProperties(
listURL string,
newList FilterYAML,
isAllowlist bool,
) (shouldRestart bool, err error) {
d.filtersMu.Lock()
defer d.filtersMu.Unlock()
filters := d.Filters
if isAllowlist {
filters = d.WhitelistFilters
}
i := slices.IndexFunc(filters, func(flt FilterYAML) bool { return flt.URL == listURL })
if i == -1 {
return false, errFilterNotExist
}
flt := &filters[i]
log.Debug(
"filtering: set name to %q, url to %s, enabled to %t for filter %s",
newList.Name,
newList.URL,
newList.Enabled,
flt.URL,
)
defer func(oldURL, oldName string, oldEnabled bool, oldUpdated time.Time, oldRulesCount int) {
if err != nil {
flt.URL = oldURL
flt.Name = oldName
flt.Enabled = oldEnabled
flt.LastUpdated = oldUpdated
flt.RulesCount = oldRulesCount
}
}(flt.URL, flt.Name, flt.Enabled, flt.LastUpdated, flt.RulesCount)
flt.Name = newList.Name
if flt.URL != newList.URL {
if d.filterExistsLocked(newList.URL) {
return false, errFilterExists
}
shouldRestart = true
flt.URL = newList.URL
flt.LastUpdated = time.Time{}
flt.unload()
}
if flt.Enabled != newList.Enabled {
flt.Enabled = newList.Enabled
shouldRestart = true
}
if flt.Enabled {
if shouldRestart {
// Download the filter contents.
shouldRestart, err = d.update(flt)
}
} else {
// TODO(e.burkov): The validation of the contents of the new URL is
// currently skipped if the rule list is disabled. This makes it
// possible to set a bad rules source, but the validation should still
// kick in when the filter is enabled. Consider changing this behavior
// to be stricter.
flt.unload()
}
return shouldRestart, err
}
// filterExists returns true if a filter with the same url exists in d. It's
// safe for concurrent use.
func (d *DNSFilter) filterExists(url string) (ok bool) {
d.filtersMu.RLock()
defer d.filtersMu.RUnlock()
r := d.filterExistsLocked(url)
return r
}
// filterExistsLocked returns true if d contains the filter with the same url.
// d.filtersMu is expected to be locked.
func (d *DNSFilter) filterExistsLocked(url string) (ok bool) {
for _, f := range d.Filters {
if f.URL == url {
return true
}
}
for _, f := range d.WhitelistFilters {
if f.URL == url {
return true
}
}
return false
}
// Add a filter
// Return FALSE if a filter with this URL exists
func (d *DNSFilter) filterAdd(flt FilterYAML) (err error) {
// Defer annotating to unlock sooner.
defer func() { err = errors.Annotate(err, "adding filter: %w") }()
d.filtersMu.Lock()
defer d.filtersMu.Unlock()
// Check for duplicates.
if d.filterExistsLocked(flt.URL) {
return errFilterExists
}
if flt.white {
d.WhitelistFilters = append(d.WhitelistFilters, flt)
} else {
d.Filters = append(d.Filters, flt)
}
return nil
}
// Load filters from the disk
// And if any filter has zero ID, assign a new one
func (d *DNSFilter) loadFilters(array []FilterYAML) {
for i := range array {
filter := &array[i] // otherwise we're operating on a copy
if filter.ID == 0 {
filter.ID = assignUniqueFilterID()
}
if !filter.Enabled {
// No need to load a filter that is not enabled
continue
}
err := d.load(filter)
if err != nil {
log.Error("filtering: loading filter %d: %s", filter.ID, err)
}
}
}
func deduplicateFilters(filters []FilterYAML) (deduplicated []FilterYAML) {
urls := stringutil.NewSet()
lastIdx := 0
for _, filter := range filters {
if !urls.Has(filter.URL) {
urls.Add(filter.URL)
filters[lastIdx] = filter
lastIdx++
}
}
return filters[:lastIdx]
}
// Set the next filter ID to max(filter.ID) + 1
func updateUniqueFilterID(filters []FilterYAML) {
for _, filter := range filters {
if nextFilterID < filter.ID {
nextFilterID = filter.ID + 1
}
}
}
// TODO(e.burkov): Improve this inexhaustible source of races.
func assignUniqueFilterID() int64 {
value := nextFilterID
nextFilterID++
return value
}
// Sets up a timer that will be checking for filters updates periodically
func (d *DNSFilter) periodicallyRefreshFilters() {
const maxInterval = 1 * 60 * 60
ivl := 5 // use a dynamically increasing time interval
for {
isNetErr, ok := false, false
if d.FiltersUpdateIntervalHours != 0 {
_, isNetErr, ok = d.tryRefreshFilters(true, true, false)
if ok && !isNetErr {
ivl = maxInterval
}
}
if isNetErr {
ivl *= 2
if ivl > maxInterval {
ivl = maxInterval
}
}
time.Sleep(time.Duration(ivl) * time.Second)
}
}
// tryRefreshFilters is like [refreshFilters], but backs down if the update is
// already going on.
//
// TODO(e.burkov): Get rid of the concurrency pattern which requires the
// [sync.Mutex.TryLock].
func (d *DNSFilter) tryRefreshFilters(block, allow, force bool) (updated int, isNetworkErr, ok bool) {
if ok = d.refreshLock.TryLock(); !ok {
return 0, false, false
}
defer d.refreshLock.Unlock()
updated, isNetworkErr = d.refreshFiltersIntl(block, allow, force)
return updated, isNetworkErr, ok
}
// listsToUpdate returns the slice of filter lists that could be updated.
func (d *DNSFilter) listsToUpdate(filters *[]FilterYAML, force bool) (toUpd []FilterYAML) {
now := time.Now()
d.filtersMu.RLock()
defer d.filtersMu.RUnlock()
for i := range *filters {
flt := &(*filters)[i] // otherwise we will be operating on a copy
if !flt.Enabled {
continue
}
if !force {
exp := flt.LastUpdated.Add(time.Duration(d.FiltersUpdateIntervalHours) * time.Hour)
if now.Before(exp) {
continue
}
}
toUpd = append(toUpd, FilterYAML{
Filter: Filter{
ID: flt.ID,
},
URL: flt.URL,
Name: flt.Name,
checksum: flt.checksum,
})
}
return toUpd
}
func (d *DNSFilter) refreshFiltersArray(filters *[]FilterYAML, force bool) (int, []FilterYAML, []bool, bool) {
var updateFlags []bool // 'true' if filter data has changed
updateFilters := d.listsToUpdate(filters, force)
if len(updateFilters) == 0 {
return 0, nil, nil, false
}
failNum := 0
for i := range updateFilters {
uf := &updateFilters[i]
updated, err := d.update(uf)
updateFlags = append(updateFlags, updated)
if err != nil {
failNum++
log.Error("filtering: updating filter from url %q: %s\n", uf.URL, err)
continue
}
}
if failNum == len(updateFilters) {
return 0, nil, nil, true
}
updateCount := 0
d.filtersMu.Lock()
defer d.filtersMu.Unlock()
for i := range updateFilters {
uf := &updateFilters[i]
updated := updateFlags[i]
for k := range *filters {
f := &(*filters)[k]
if f.ID != uf.ID || f.URL != uf.URL {
continue
}
f.LastUpdated = uf.LastUpdated
if !updated {
continue
}
log.Info(
"filtering: updated filter %d; rule count: %d (was %d)",
f.ID,
uf.RulesCount,
f.RulesCount,
)
f.Name = uf.Name
f.RulesCount = uf.RulesCount
f.checksum = uf.checksum
updateCount++
}
}
return updateCount, updateFilters, updateFlags, false
}
// refreshFiltersIntl checks filters and updates them if necessary. If force is
// true, it ignores the filter.LastUpdated field value.
//
// Algorithm:
//
// 1. Get the list of filters to be updated. For each filter, run the download
// and checksum check operation. Store downloaded data in a temporary file
// inside data/filters directory
//
// 2. For each filter, if filter data hasn't changed, just set new update time
// on file. Otherwise, rename the temporary file (<temp> -> 1.txt). Note
// that this method works only on Unix systems. On Windows, don't pass
// files to filtering, pass the whole data.
//
// refreshFiltersIntl returns the number of updated filters. It also returns
// true if there was a network error and nothing could be updated.
//
// TODO(a.garipov, e.burkov): What the hell?
func (d *DNSFilter) refreshFiltersIntl(block, allow, force bool) (int, bool) {
updNum := 0
log.Debug("filtering: starting updating")
defer func() { log.Debug("filtering: finished updating, %d updated", updNum) }()
var lists []FilterYAML
var toUpd []bool
isNetErr := false
if block {
updNum, lists, toUpd, isNetErr = d.refreshFiltersArray(&d.Filters, force)
}
if allow {
updNumAl, listsAl, toUpdAl, isNetErrAl := d.refreshFiltersArray(&d.WhitelistFilters, force)
updNum += updNumAl
lists = append(lists, listsAl...)
toUpd = append(toUpd, toUpdAl...)
isNetErr = isNetErr || isNetErrAl
}
if isNetErr {
return 0, true
}
if updNum != 0 {
d.EnableFilters(false)
for i := range lists {
uf := &lists[i]
updated := toUpd[i]
if !updated {
continue
}
p := uf.Path(d.DataDir)
err := os.Remove(p + ".old")
if err != nil {
log.Debug("filtering: removing old filter file %q: %s", p, err)
}
}
}
return updNum, false
}
// update refreshes filter's content and a/mtimes of it's file.
func (d *DNSFilter) update(filter *FilterYAML) (b bool, err error) {
b, err = d.updateIntl(filter)
filter.LastUpdated = time.Now()
if !b {
chErr := os.Chtimes(
filter.Path(d.DataDir),
filter.LastUpdated,
filter.LastUpdated,
)
if chErr != nil {
log.Error("filtering: os.Chtimes(): %s", chErr)
}
}
return b, err
}
// updateIntl updates the flt rewriting it's actual file. It returns true if
// the actual update has been performed.
func (d *DNSFilter) updateIntl(flt *FilterYAML) (ok bool, err error) {
log.Debug("filtering: downloading update for filter %d from %q", flt.ID, flt.URL)
var res *rulelist.ParseResult
// Change the default 0o600 permission to something more acceptable by end
// users.
//
// See https://github.com/AdguardTeam/AdGuardHome/issues/3198.
tmpFile, err := aghrenameio.NewPendingFile(flt.Path(d.DataDir), 0o644)
if err != nil {
return false, err
}
defer func() { err = d.finalizeUpdate(tmpFile, flt, res, err, ok) }()
r, err := d.reader(flt.URL)
if err != nil {
// Don't wrap the error since it's informative enough as is.
return false, err
}
defer func() { err = errors.WithDeferred(err, r.Close()) }()
bufPtr := d.bufPool.Get().(*[]byte)
defer d.bufPool.Put(bufPtr)
p := rulelist.NewParser()
res, err = p.Parse(tmpFile, r, *bufPtr)
return res.Checksum != flt.checksum && err == nil, err
}
// finalizeUpdate closes and gets rid of temporary file f with filter's content
// according to updated. It also saves new values of flt's name, rules number
// and checksum if succeeded.
func (d *DNSFilter) finalizeUpdate(
file aghrenameio.PendingFile,
flt *FilterYAML,
res *rulelist.ParseResult,
returned error,
updated bool,
) (err error) {
id := flt.ID
if !updated {
if returned == nil {
log.Debug("filtering: filter %d from url %q has no changes, skipping", id, flt.URL)
}
return errors.WithDeferred(returned, file.Cleanup())
}
log.Info("filtering: saving contents of filter %d into %q", id, flt.Path(d.DataDir))
err = file.CloseReplace()
if err != nil {
return fmt.Errorf("finalizing update: %w", err)
}
rulesCount := res.RulesCount
log.Info("filtering: updated filter %d: %d bytes, %d rules", id, res.BytesWritten, rulesCount)
flt.ensureName(res.Title)
flt.checksum = res.Checksum
flt.RulesCount = rulesCount
return nil
}
// reader returns an io.ReadCloser reading filtering-rule list data form either
// a file on the filesystem or the filter's HTTP URL.
func (d *DNSFilter) reader(fltURL string) (r io.ReadCloser, err error) {
if !filepath.IsAbs(fltURL) {
r, err = d.readerFromURL(fltURL)
if err != nil {
return nil, fmt.Errorf("reading from url: %w", err)
}
return r, nil
}
r, err = os.Open(fltURL)
if err != nil {
return nil, fmt.Errorf("opening file: %w", err)
}
return r, nil
}
// readerFromURL returns an io.ReadCloser reading filtering-rule list data form
// the filter's URL.
func (d *DNSFilter) readerFromURL(fltURL string) (r io.ReadCloser, err error) {
resp, err := d.HTTPClient.Get(fltURL)
if err != nil {
// Don't wrap the error since it's informative enough as is.
return nil, err
}
if resp.StatusCode != http.StatusOK {
return nil, fmt.Errorf("got status code %d, want %d", resp.StatusCode, http.StatusOK)
}
return resp.Body, nil
}
// loads filter contents from the file in dataDir
func (d *DNSFilter) load(flt *FilterYAML) (err error) {
fileName := flt.Path(d.DataDir)
log.Debug("filtering: loading filter %d from %q", flt.ID, fileName)
file, err := os.Open(fileName)
if errors.Is(err, os.ErrNotExist) {
// Do nothing, file doesn't exist.
return nil
} else if err != nil {
return fmt.Errorf("opening filter file: %w", err)
}
defer func() { err = errors.WithDeferred(err, file.Close()) }()
st, err := file.Stat()
if err != nil {
return fmt.Errorf("getting filter file stat: %w", err)
}
log.Debug("filtering: file %q, id %d, length %d", fileName, flt.ID, st.Size())
bufPtr := d.bufPool.Get().(*[]byte)
defer d.bufPool.Put(bufPtr)
p := rulelist.NewParser()
res, err := p.Parse(io.Discard, file, *bufPtr)
if err != nil {
return fmt.Errorf("parsing filter file: %w", err)
}
flt.ensureName(res.Title)
flt.RulesCount, flt.checksum, flt.LastUpdated = res.RulesCount, res.Checksum, st.ModTime()
return nil
}
func (d *DNSFilter) EnableFilters(async bool) {
d.filtersMu.RLock()
defer d.filtersMu.RUnlock()
d.enableFiltersLocked(async)
}
func (d *DNSFilter) enableFiltersLocked(async bool) {
filters := make([]Filter, 1, len(d.Filters)+len(d.WhitelistFilters)+1)
filters[0] = Filter{
ID: CustomListID,
Data: []byte(strings.Join(d.UserRules, "\n")),
}
for _, filter := range d.Filters {
if !filter.Enabled {
continue
}
filters = append(filters, Filter{
ID: filter.ID,
FilePath: filter.Path(d.DataDir),
})
}
var allowFilters []Filter
for _, filter := range d.WhitelistFilters {
if !filter.Enabled {
continue
}
allowFilters = append(allowFilters, Filter{
ID: filter.ID,
FilePath: filter.Path(d.DataDir),
})
}
err := d.setFilters(filters, allowFilters, async)
if err != nil {
log.Error("filtering: enabling filters: %s", err)
}
d.SetEnabled(d.FilteringEnabled)
}