diff --git a/CHANGELOG.md b/CHANGELOG.md index cb370101e..34109d215 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,7 @@ All notable changes to this project will be documented in this file. - Fixed [long URLs display](https://github.com/plausible/analytics/issues/3158) in Outbound Link breakdown view - Fixed [Sentry reports](https://github.com/plausible/analytics/discussions/3166) for ingestion requests plausible/analytics#3182 - Fix breakdown pagination bug in the dashboard details view when filtering by goals +- Update bot detection (matomo 6.1.4, ua_inspector 3.4.0) ## v2.0.0 - 2023-07-12 diff --git a/config/config.exs b/config/config.exs index f226f62eb..0ca8e5977 100644 --- a/config/config.exs +++ b/config/config.exs @@ -22,7 +22,8 @@ config :logger, :console, config :phoenix, :json_library, Jason config :ua_inspector, - database_path: "priv/ua_inspector" + database_path: "priv/ua_inspector", + remote_release: "66d80de32fbb265941f4d7941fadc19097375097" config :ref_inspector, database_path: "priv/ref_inspector" diff --git a/mix.lock b/mix.lock index fe0c9895b..6634da11c 100644 --- a/mix.lock +++ b/mix.lock @@ -135,7 +135,7 @@ "timex": {:hex, :timex, "3.7.11", "bb95cb4eb1d06e27346325de506bcc6c30f9c6dea40d1ebe390b262fad1862d1", [:mix], [{:combine, "~> 0.10", [hex: :combine, repo: "hexpm", optional: false]}, {:gettext, "~> 0.20", [hex: :gettext, repo: "hexpm", optional: false]}, {:tzdata, "~> 1.1", [hex: :tzdata, repo: "hexpm", optional: false]}], "hexpm", "8b9024f7efbabaf9bd7aa04f65cf8dcd7c9818ca5737677c7b76acbc6a94d1aa"}, "tls_certificate_check": {:hex, :tls_certificate_check, "1.15.0", "1c0377617a1111000bca3f4cd530b62690c9bd2dc9b868b4459203cd4d7f16ab", [:rebar3], [{:ssl_verify_fun, "1.1.6", [hex: :ssl_verify_fun, repo: "hexpm", optional: false]}], "hexpm", "87fd2e865078fdf8913a8c27bd8fe2be986383e31011f21d7f92cc5f7bc90731"}, "tzdata": {:hex, :tzdata, "1.1.1", "20c8043476dfda8504952d00adac41c6eda23912278add38edc140ae0c5bcc46", [:mix], [{:hackney, "~> 
1.17", [hex: :hackney, repo: "hexpm", optional: false]}], "hexpm", "a69cec8352eafcd2e198dea28a34113b60fdc6cb57eb5ad65c10292a6ba89787"}, - "ua_inspector": {:hex, :ua_inspector, "3.2.1", "16e249f276cbf19c9137dccf485f73a49119a5cc7d4f240a143fdb3191608c34", [:mix], [{:hackney, "~> 1.0", [hex: :hackney, repo: "hexpm", optional: false]}, {:yamerl, "~> 0.7", [hex: :yamerl, repo: "hexpm", optional: false]}], "hexpm", "f3bce513b2452a2891fce08f7e2ab74875ccdd7c920ff52e61862b0481e248f9"}, + "ua_inspector": {:hex, :ua_inspector, "3.4.0", "9410b51f9aeda5074da3f4f32553f3bc20b6463869a3822db1ee08aa6d0afbb9", [:mix], [{:hackney, "~> 1.0", [hex: :hackney, repo: "hexpm", optional: false]}, {:yamerl, "~> 0.7", [hex: :yamerl, repo: "hexpm", optional: false]}], "hexpm", "4fb3d9283621935a6f5158c12e30ce7ac18e004f6f11e05f5e3ae9ef8beb7022"}, "unicode_util_compat": {:hex, :unicode_util_compat, "0.7.0", "bc84380c9ab48177092f43ac89e4dfa2c6d62b40b8bd132b1059ecc7232f9a78", [:rebar3], [], "hexpm", "25eee6d67df61960cf6a794239566599b09e17e668d3700247bc498638152521"}, "unsafe": {:hex, :unsafe, "1.0.1", "a27e1874f72ee49312e0a9ec2e0b27924214a05e3ddac90e91727bc76f8613d8", [:mix], [], "hexpm", "6c7729a2d214806450d29766abc2afaa7a2cbecf415be64f36a6691afebb50e5"}, "websock": {:hex, :websock, "0.5.2", "b3c08511d8d79ed2c2f589ff430bd1fe799bb389686dafce86d28801783d8351", [:mix], [], "hexpm", "925f5de22fca6813dfa980fb62fd542ec43a2d1a1f83d2caec907483fe66ff05"}, diff --git a/priv/ua_inspector/bot.bots.yml b/priv/ua_inspector/bot.bots.yml index c7746440f..33a0194fe 100644 --- a/priv/ua_inspector/bot.bots.yml +++ b/priv/ua_inspector/bot.bots.yml @@ -712,7 +712,15 @@ name: 'Visual Meta' url: 'https://www.shopalike.cz/' -- regex: 'AdsBot-Google|Adwords-(DisplayAds|Express|Instant)|Google Web Preview|Google[ -]Publisher[ 
-]Plugin|Google-(Ads-Conversions|Ads-Qualify|Adwords|AMPHTML|Assess|HotelAdsVerifier|Read-Aloud|Shopping-Quality|Site-Verification|speakr|Stale-Content-Probe|Test|Youtube-Links)|(APIs|DuplexWeb|Feedfetcher|Mediapartners)-Google|Googlebot|Google(?:AdSenseInfeed|AssociationService|Prober|Producer)|Google.*/\+/web/snippet' +- regex: 'AdsBot-Google|Adwords-(DisplayAds|Express|Instant)|Google Web Preview|Google[ -]Publisher[ -]Plugin|Google-(Ads-Conversions|Ads-Qualify|Adwords|AMPHTML|Assess|HotelAdsVerifier|InspectionTool|Read-Aloud|Shopping-Quality|Site-Verification|speakr|Stale-Content-Probe|Test|Youtube-Links)|(APIs|DuplexWeb|Feedfetcher|Mediapartners)-Google|Googlebot|Google(?:AdSenseInfeed|AssociationService|Other|Prober|Producer)|Google.*/\+/web/snippet' + name: 'Googlebot' + category: 'Search bot' + url: 'http://www.google.com/bot.html' + producer: + name: 'Google Inc.' + url: 'http://www.google.com' + +- regex: '^Google$' name: 'Googlebot' category: 'Search bot' url: 'http://www.google.com/bot.html' @@ -735,6 +743,11 @@ name: 'HubSpot Inc.' url: 'https://www.hubspot.com' +- regex: 'vuhuvBot' + name: 'Vuhuv Bot' + category: 'Crawler' + url: 'http://vuhuv.com/bot.html' + - regex: 'HTTPMon' name: 'HTTPMon' category: 'Site Monitor' @@ -1028,6 +1041,14 @@ - regex: 'Octopus [0-9]' name: 'Octopus' +- regex: 'OnlineOrNot.com_bot' + name: 'OnlineOrNot Bot' + category: 'Site Monitor' + url: 'https://onlineornot.com/website-monitoring' + producer: + name: 'OnlineOrNot' + url: 'https://onlineornot.com' + - regex: 'omgili' name: 'Omgili bot' category: 'Search bot' @@ -1749,7 +1770,15 @@ - regex: 'Y!J-BRW' name: 'Yahoo! Japan BRW' category: 'Crawler' - url: 'https://www.yahoo-help.jp/app/answers/detail/p/595/a_id/42716/~/ウェブページにアクセスするシステムのユーザーエージェントについて' + url: 'https://support.yahoo-net.jp/PccSearch/s/article/H000007955' + producer: + name: 'Yahoo! Japan Corp.' + url: 'https://www.yahoo.co.jp/' + +- regex: 'Y!J-WSC' + name: 'Yahoo! 
Japan WSC' + category: 'Crawler' + url: 'https://support.yahoo-net.jp/PccSearch/s/article/H000007955' producer: name: 'Yahoo! Japan Corp.' url: 'https://www.yahoo.co.jp/' @@ -1974,7 +2003,7 @@ - regex: 'RSSRadio \(Push Notification Scanner;support@dorada\.co\.uk\)' name: 'RSSRadio Bot' -- regex: '(A6-Indexer|nuhk|TsolCrawler|Yammybot|Openbot|Gulper Web Bot|grub-client|Download Demon|SearchExpress|Microsoft URL Control|borg|altavista|dataminr.com|tweetedtimes.com|TrendsmapResolver|teoma|blitzbot|oegp|furlbot|http%20client|polybot|htdig|mogimogi|larbin|scrubby|searchsight|seekbot|semanticdiscovery|snappy|vortex(?!(?: Build|Plus))|zeal(?!ot)|fast-webcrawler|converacrawler|dataparksearch|findlinks|BrowserMob|HttpMonitor|ThumbShotsBot|URL2PNG|ZooShot|GomezA|Google SketchUp|Read%20Later|RackspaceBot|robots|SeopultContentAnalyzer|7Siters|centuryb.o.t9|InterNaetBoten|EasyBib AutoCite|Bidtellect|tomnomnom/meg|My User Agent|cortex|CF-UC User Agent|Re-re Studio|adreview|AHC/|NameOfAgent|Request-Promise|ALittle Client|Hello,? 
world|wp_is_mobile|0xAbyssalDoesntExist|Anarchy99|daumoa,damoa,daum,daumos,duamoa,duam,duamos|^revolt|nvd0rz|xfa1|Hakai|gbrmss|fuck-your-hp|IDBTE4M CODE87|Antoine|Insomania|Hells-Net|b3astmode|Linux Gnu \(cow\)|custom_user_agent|Test Certificate Info|iplabel)' +- regex: '(A6-Indexer|nuhk|TsolCrawler|Yammybot|Openbot|Gulper Web Bot|grub-client|Download Demon|SearchExpress|Microsoft URL Control|borg|altavista|dataminr.com|tweetedtimes.com|TrendsmapResolver|teoma|blitzbot|oegp|furlbot|http%20client|polybot|htdig|mogimogi|larbin|scrubby|searchsight|seekbot|semanticdiscovery|snappy|vortex(?!(?: Build|Plus))|zeal(?!ot)|fast-webcrawler|converacrawler|dataparksearch|findlinks|BrowserMob|HttpMonitor|ThumbShotsBot|URL2PNG|ZooShot|GomezA|Google SketchUp|Read%20Later|RackspaceBot|robots|SeopultContentAnalyzer|7Siters|centuryb.o.t9|InterNaetBoten|EasyBib AutoCite|Bidtellect|tomnomnom/meg|My User Agent|cortex|CF-UC User Agent|Re-re Studio|adreview|AHC/|NameOfAgent|Request-Promise|ALittle Client|Hello,? world|wp_is_mobile|0xAbyssalDoesntExist|Anarchy99|daumoa,damoa,daum,daumos,duamoa,duam,duamos|^revolt|nvd0rz|xfa1|Hakai|gbrmss|fuck-your-hp|IDBTE4M CODE87|Antoine|Insomania|Hells-Net|b3astmode|Linux Gnu \(cow\)|custom_user_agent|Test Certificate Info|iplabel|Magellan)' name: 'Generic Bot' - regex: '^sentry' @@ -2033,7 +2062,15 @@ name: 'WooRank sprl' url: 'https://www.woorank.com/' -- regex: '(Match|LinkCheck) by Siteimprove.com' +- regex: 'by Siteimprove\.com' + name: 'Siteimprove' + category: 'Search bot' + url: 'https://siteimprove.com/' + producer: + name: 'Siteimprove GmbH' + url: 'https://siteimprove.com/' + +- regex: 'Image size by Siteimprove\.com' name: 'Siteimprove' category: 'Search bot' url: 'https://siteimprove.com/' @@ -2161,6 +2198,14 @@ name: 'Startpagina B.V.' 
url: 'https://www.startpagina.nl/' +- regex: 'MoodleBot-Linkchecker' + name: 'MoodleBot Linkchecker' + category: 'Search bot' + url: 'https://docs.moodle.org/en/Usage' + producer: + name: 'Moodle Pty Ltd' + url: 'https://moodle.org/' + - regex: 'GTmetrix' name: 'GTmetrix' category: 'Crawler' @@ -2420,6 +2465,13 @@ producer: name: 'Hatena Co., Ltd.' url: 'https://www.hatena.ne.jp' +- regex: 'Hatena-?Bookmark' + name: 'Hatena Bookmark' + category: 'Crawler' + url: 'https://www.hatena.ne.jp/faq/' + producer: + name: 'Hatena Co., Ltd.' + url: 'https://www.hatena.ne.jp' - regex: 'RyowlEngine/(\d+)' name: 'Ryowl' @@ -2564,6 +2616,14 @@ category: 'Security Checker' url: 'https://github.com/LeakIX/l9explore' +- regex: 'l9scan/|^Lkx-(.*)/([\d+.]+)' + name: 'LeakIX' + category: 'Security Checker' + url: 'https://leakix.net/' + producer: + name: 'BaDaaS SRL' + url: 'https://leakix.net/' + - regex: 'MegaIndex.ru/([\d+\.])' name: 'MegaIndex' category: 'Crawler' @@ -3055,14 +3115,6 @@ name: 'New Work SE' url: 'https://www.xing.com/' -- regex: '^Lkx-(.*)/([\d+.]+)' - name: 'LeakIX' - category: 'Security Checker' - url: 'https://leakix.net/' - producer: - name: 'BaDaaS SRL' - url: 'https://leakix.net/' - - regex: 'RepoLookoutBot/([\d+.]+)' name: 'Repo Lookout' category: 'Security Checker' @@ -3293,6 +3345,96 @@ name: 'Morningscore' url: 'https://morningscore.io/' +- regex: 'Uptime-Kuma/([\d+.]+)' + name: 'Uptime-Kuma' + category: 'Site Monitor' + url: 'https://github.com/louislam/uptime-kuma' + +- regex: 'ChatGPT-User' + name: 'ChatGPT' + category: 'Crawler' + url: 'https://platform.openai.com/docs/plugins/bot' + producer: + name: 'OpenAI OpCo, LLC' + url: 'https://openai.com/' + +- regex: 'BrightEdge Crawler/([\d+.]+)' + name: 'BrightEdge' + category: 'Crawler' + url: 'https://www.brightedge.com/' + producer: + name: 'BrightEdge Technologies, Inc' + url: 'https://www.brightedge.com/' + +- regex: 'sfFeedReader/([\d+.]+)' + name: 'sfFeedReader' + url: 
'https://github.com/diem-project/sfFeed2Plugin' + category: 'Feed Fetcher' + +- regex: 'cyberscan.io' + name: 'Cyberscan' + category: 'Security Checker' + url: 'https://www.cyberscan.io/' + producer: + name: 'DGC Verwaltungs GmbH' + url: 'https://dgc.org/' + +- regex: 'deepcrawl\.com' + name: 'Lumar' + category: 'Crawler' + url: 'https://deepcrawl.com/bot' + producer: + name: 'Lumar' + url: 'https://www.lumar.io/' + +- regex: 'RepoLookoutBot' + name: 'Repo Lookout' + category: 'Crawler' + url: 'https://www.repo-lookout.org/' + producer: + name: 'Crissy Field GmbH' + url: 'https://www.crissyfield.de/' + +- regex: 'researchscan.comsys.rwth-aachen.de' + name: 'Research Scan' + category: 'Crawler' + url: 'http://researchscan.comsys.rwth-aachen.de/' + producer: + name: 'RWTH Aachen University' + url: 'https://www.comsys.rwth-aachen.de/' + +- regex: 'newspaper/([\d+.]+)' + name: 'Scraping Robot' + category: 'Crawler' + url: 'https://scrapingrobot.com/' + producer: + name: 'Sprious LLC' + url: 'https://sprious.com/' + +- regex: 'GPTBot/([\d+.]+)' + name: 'GPTBot' + category: 'Crawler' + url: 'https://platform.openai.com/docs/gptbot' + producer: + name: 'OpenAI OpCo, LLC' + url: 'https://openai.com/' + +- regex: 'Ant.com beta/([\d+.]+)' + name: 'Ant' + category: 'Crawler' + url: 'https://www.ant.com/' + producer: + name: 'Ant.com Ltd.' + url: 'https://www.ant.com/' + +- regex: 'WebwikiBot/([\d+.]+)' + name: 'Webwiki' + category: 'Crawler' + url: 'https://www.webwiki.com/' + producer: + name: 'webwiki GmbH' + url: 'https://www.webwiki.com/' + # Generic detections - regex: '[a-z0-9\-_]*((?