Extract body_domains and body_fqdns to jsonl (#1750)

* Extract body-domains and body-fqdns

* remove port in domains

* Add test for domains extraction

* misc update

* improve domain regex

* fix test

* extract domain inside quotes

* sanitize urls

* fix test

* minor

* do not embed

* remove JS variable false positives + improve regex

---------

Co-authored-by: Tarun Koyalwar <tarun@projectdiscovery.io>
Ramana Reddy 2024-06-23 01:01:43 +05:30 committed by GitHub
parent 72f4c2cef4
commit 9330887a58
10 changed files with 182 additions and 5 deletions

.gitignore

@@ -10,3 +10,6 @@ cmd/functional-test/httpx
cmd/functional-test/*.cfg
.devcontainer
/httpx
/dist
/resume.cfg


@@ -2,7 +2,9 @@ package httpx
import (
"bytes"
"fmt"
"net/url"
"regexp"
"strings"
"github.com/PuerkitoBio/goquery"
@@ -64,7 +66,7 @@ func parsePotentialDomains(fqdns, domains map[string]struct{}, data string) {
// we extract only potential domains
for _, t := range tokens {
if isPotentialDomain(t) {
if dn, err := publicsuffix.Parse(extractDomain(removeWildcards(t))); err == nil {
if dn, err := publicsuffix.Parse(extractDomain(t)); err == nil {
domains[dn.SLD+"."+dn.TLD] = struct{}{}
if dn.TRD != "" {
fqdns[dn.String()] = struct{}{}
@@ -79,15 +81,17 @@ func isPotentialDomain(s string) bool {
}
func extractDomain(str string) string {
str = removeWildcards(str)
u := str
if !strings.Contains(str, "://") {
u = "https://" + str
}
u = sanitizeURL(u)
parsedURL, err := url.Parse(u)
if err != nil {
return str
return ""
}
return parsedURL.Host
return parsedURL.Hostname()
}
func removeWildcards(domain string) string {
@@ -108,3 +112,12 @@ func removeWildcards(domain string) string {
}
return strings.Join(parts, ".")
}
var urlInvalidCharRegex = regexp.MustCompile(`[^\w-./:~]`)
func sanitizeURL(u string) string {
// Replace invalid characters with percent-encoded equivalents
return urlInvalidCharRegex.ReplaceAllStringFunc(u, func(match string) string {
return fmt.Sprintf("%%%02X", match[0])
})
}
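
The helpers above are unexported, so the following standalone sketch mirrors them to illustrate the net effect of this hunk: schemeless tokens get an "https://" prefix, characters outside the [\w-./:~] allow-list are percent-encoded before parsing, and Hostname() rather than Host drops any ":port" suffix (the "remove port in domains" change from the commit message). The removeWildcards step is omitted for brevity; names and the sample input are illustrative only.

package main

import (
	"fmt"
	"net/url"
	"regexp"
	"strings"
)

// Mirrors the patched helpers; not the exported httpx API.
var urlInvalidCharRegex = regexp.MustCompile(`[^\w-./:~]`)

func sanitizeURL(u string) string {
	// Percent-encode every character outside the allow-list.
	return urlInvalidCharRegex.ReplaceAllStringFunc(u, func(match string) string {
		return fmt.Sprintf("%%%02X", match[0])
	})
}

func extractDomain(str string) string {
	u := str
	if !strings.Contains(str, "://") {
		u = "https://" + str
	}
	parsed, err := url.Parse(sanitizeURL(u))
	if err != nil {
		return ""
	}
	// Hostname() strips the port, unlike Host.
	return parsed.Hostname()
}

func main() {
	// "?" and "=" become %3F and %3D, and ":8443" is dropped by Hostname().
	fmt.Println(extractDomain("example.com:8443/login?next=/home")) // example.com
}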

common/httpx/domains.go

@@ -0,0 +1,113 @@
package httpx
import (
"regexp"
"strings"
"unicode"
mapsutil "github.com/projectdiscovery/utils/maps"
stringsutil "github.com/projectdiscovery/utils/strings"
"github.com/weppos/publicsuffix-go/publicsuffix"
)
const (
// group 1 is the actual domain regex, while group 0 and group 2 are used to filter out invalid matches (by skipping irrelevant contexts)
potentialDomainRegex = `(?:^|['"/@])` + `([a-z0-9]+[a-z0-9.-]*\.[a-z]{2,})` + `(?:['"/@]|$)`
)
var (
// potentialDomainsCompiled is a compiled regex for potential domains (aka domain names)
potentialDomainsCompiled = regexp.MustCompile(potentialDomainRegex)
defaultDenylist = []string{".3g2", ".3gp", ".7z", ".apk", ".arj", ".avi", ".axd", ".bmp", ".csv", ".deb", ".dll", ".doc", ".drv", ".eot", ".exe", ".flv", ".gif", ".gifv", ".gz", ".h264", ".ico", ".iso", ".jar", ".jpeg", ".jpg", ".lock", ".m4a", ".m4v", ".map", ".mkv", ".mov", ".mp3", ".mp4", ".mpeg", ".mpg", ".msi", ".ogg", ".ogm", ".ogv", ".otf", ".pdf", ".pkg", ".png", ".ppt", ".psd", ".rar", ".rm", ".rpm", ".svg", ".swf", ".sys", ".tar.gz", ".tar", ".tif", ".tiff", ".ttf", ".txt", ".vob", ".wav", ".webm", ".webp", ".wmv", ".woff", ".woff2", ".xcf", ".xls", ".xlsx", ".zip", ".css", ".js", ".map", ".php", ".sheet", ".ms", ".wp", ".html", ".htm", ".md"}
suffixBlacklist = map[string]struct{}{}
)
type BodyDomain struct {
Fqdns []string `json:"body_fqdn,omitempty"`
Domains []string `json:"body_domains,omitempty"`
}
func (h *HTTPX) BodyDomainGrab(r *Response) *BodyDomain {
domains := make(map[string]struct{})
fqdns := make(map[string]struct{})
for _, tmp := range potentialDomainsCompiled.FindAllStringSubmatch(r.Raw, -1) {
// only interested in 1st group
if len(tmp) < 2 {
continue
}
d := tmp[1]
// minimal + known blacklist
if !isValidDomain(d) {
continue
}
// try to parse its tld
if !isValidTLD(d) {
continue
}
// get domain
val, err := publicsuffix.Domain(d)
if err != nil {
continue
}
if r.Input != val {
domains[val] = struct{}{}
}
if d != val && d != r.Input {
fqdns[d] = struct{}{}
}
}
return &BodyDomain{Domains: mapsutil.GetKeys(domains), Fqdns: mapsutil.GetKeys(fqdns)}
}
func isValidDomain(d string) bool {
parts := strings.Split(d, ".")
if len(parts) < 2 {
return false
}
// this is true when all parts are numeric
// in which case this is not a valid domain (could be an IP or something else)
allnumeric := true
// traverse in reverse
for i := len(parts) - 1; i >= 0; i-- {
if _, ok := suffixBlacklist["."+parts[i]]; ok {
return false
}
// check for numeric
local:
for _, c := range parts[i] {
if !unicode.IsDigit(c) {
allnumeric = false
break local
}
}
}
if allnumeric {
// not a domain; could be an IP or something else
return false
}
// simple hack for android/ios package name
if stringsutil.HasPrefixAny(d, "com", "net", "io", "org") && !stringsutil.HasSuffixAny(d, "com", "net", "io", "org") {
return false
}
return true
}
func isValidTLD(domain string) bool {
rule := publicsuffix.DefaultList.Find(domain, publicsuffix.DefaultFindOptions)
if rule == nil || rule.Type != publicsuffix.NormalType {
return false
}
_, err := publicsuffix.ParseFromListWithOptions(publicsuffix.DefaultList, domain, &publicsuffix.FindOptions{DefaultRule: rule})
return err == nil
}
func init() {
for _, s := range defaultDenylist {
suffixBlacklist[s] = struct{}{}
}
}
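
As a rough, self-contained illustration of the extraction flow in BodyDomainGrab, the sketch below applies the same potential-domain regex to a small HTML/JS snippet and uses publicsuffix to split each candidate into its FQDN and registered domain. The denylist, numeric-only, and package-name checks from isValidDomain, as well as the isValidTLD rule lookup, are left out; the input string and variable names are made up for the example.

package main

import (
	"fmt"
	"regexp"

	"github.com/weppos/publicsuffix-go/publicsuffix"
)

// Same pattern as potentialDomainRegex above: group 1 is the candidate,
// the surrounding quote/slash/at context filters out unrelated tokens.
var potentialDomains = regexp.MustCompile(`(?:^|['"/@])([a-z0-9]+[a-z0-9.-]*\.[a-z]{2,})(?:['"/@]|$)`)

func main() {
	body := `<a href="https://docs.hackerone.com/start">docs</a> var api = "api.example.com";`
	for _, m := range potentialDomains.FindAllStringSubmatch(body, -1) {
		dn, err := publicsuffix.Parse(m[1])
		if err != nil {
			continue
		}
		// SLD+TLD is the registered domain, String() the full FQDN.
		fmt.Printf("fqdn=%s domain=%s\n", dn.String(), dn.SLD+"."+dn.TLD)
	}
	// Output:
	// fqdn=docs.hackerone.com domain=hackerone.com
	// fqdn=api.example.com domain=example.com
}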


@@ -0,0 +1,29 @@
package httpx
import (
_ "embed"
"sort"
"testing"
"github.com/stretchr/testify/require"
)
//go:embed test-data/hackerone.html
var rawResponse string
func TestBodyGrabDomains(t *testing.T) {
ht, err := New(&DefaultOptions)
require.Nil(t, err)
response := &Response{
Raw: rawResponse,
}
bd := ht.BodyDomainGrab(response)
sort.Strings(bd.Domains)
sort.Strings(bd.Fqdns)
t.Run("body domain grab", func(t *testing.T) {
require.Equal(t, 24, len(bd.Domains))
require.Equal(t, 16, len(bd.Fqdns))
})
}


@@ -220,6 +220,7 @@ get_response:
}
var resp Response
resp.Input = req.Host
resp.Headers = httpresp.Header.Clone()
@@ -313,6 +314,7 @@ get_response:
if h.Options.ExtractFqdn {
resp.CSPData = h.CSPGrab(&resp)
resp.BodyDomains = h.BodyDomainGrab(&resp)
}
// build the redirect flow by reverse cycling the response<-request chain


@@ -25,6 +25,6 @@ func TestDo(t *testing.T) {
require.Nil(t, err)
resp, err := ht.Do(req, UnsafeOptions{})
require.Nil(t, err)
require.Equal(t, 318, resp.ContentLength)
require.Greater(t, len(resp.Raw), 800)
})
}


@@ -10,6 +10,7 @@ import (
// Response contains the response to a server
type Response struct {
Input string // input that was given
StatusCode int
Headers map[string][]string
RawData []byte // undecoded data
@@ -21,6 +22,7 @@ type Response struct {
Lines int
TLSData *clients.Response
CSPData *CSPData
BodyDomains *BodyDomain
HTTP2 bool
Pipeline bool
Duration time.Duration

File diff suppressed because one or more lines are too long


@@ -2211,6 +2211,10 @@ retry:
RequestRaw: requestDump,
Response: resp,
}
if resp.BodyDomains != nil {
result.Fqdns = resp.BodyDomains.Fqdns
result.Domains = resp.BodyDomains.Domains
}
return result
}


@@ -89,6 +89,8 @@ type Result struct {
ScreenshotPathRel string `json:"screenshot_path_rel,omitempty" csv:"screenshot_path_rel"`
KnowledgeBase map[string]interface{} `json:"knowledgebase,omitempty" csv:"knowledgebase"`
Resolvers []string `json:"resolvers,omitempty" csv:"resolvers"`
Fqdns []string `json:"body_fqdn,omitempty"`
Domains []string `json:"body_domains,omitempty"`
// Internal Fields
TechnologyDetails map[string]wappalyzer.AppInfo `json:"-" csv:"-"`
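
For reference, a minimal sketch of how the two new fields end up in the JSONL output once the runner copies them from resp.BodyDomains: the struct below is a cut-down stand-in for Result that keeps only the tags added in this commit, and the values are made-up sample data.

package main

import (
	"encoding/json"
	"fmt"
)

// Stand-in for the two Result fields added above; not the full runner.Result.
type result struct {
	Fqdns   []string `json:"body_fqdn,omitempty"`
	Domains []string `json:"body_domains,omitempty"`
}

func main() {
	line, _ := json.Marshal(result{
		Fqdns:   []string{"docs.hackerone.com", "api.hackerone.com"},
		Domains: []string{"hackerone.com"},
	})
	fmt.Println(string(line))
	// {"body_fqdn":["docs.hackerone.com","api.hackerone.com"],"body_domains":["hackerone.com"]}
}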