mirror of
https://github.com/projectdiscovery/httpx.git
synced 2024-11-24 05:05:54 +03:00
Extract body_domains
and body_fqdns
to jsonl (#1750)
* Extract body-domains and body-fqdns * remove port in domains * Add test for domains extraction * misc update * improve domain regex * fix test * extract domain inside quotes * sanitize urls * fix test * minor * do not embed * remove js variables fp + improve regex --------- Co-authored-by: Tarun Koyalwar <tarun@projectdiscovery.io>
This commit is contained in:
parent
72f4c2cef4
commit
9330887a58
5
.gitignore
vendored
5
.gitignore
vendored
@ -9,4 +9,7 @@ cmd/functional-test/functional-test
|
||||
cmd/functional-test/httpx
|
||||
cmd/functional-test/*.cfg
|
||||
|
||||
.devcontainer
|
||||
.devcontainer
|
||||
/httpx
|
||||
/dist
|
||||
/resume.cfg
|
@ -2,7 +2,9 @@ package httpx
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"fmt"
|
||||
"net/url"
|
||||
"regexp"
|
||||
"strings"
|
||||
|
||||
"github.com/PuerkitoBio/goquery"
|
||||
@ -64,7 +66,7 @@ func parsePotentialDomains(fqdns, domains map[string]struct{}, data string) {
|
||||
// we extract only potential domains
|
||||
for _, t := range tokens {
|
||||
if isPotentialDomain(t) {
|
||||
if dn, err := publicsuffix.Parse(extractDomain(removeWildcards(t))); err == nil {
|
||||
if dn, err := publicsuffix.Parse(extractDomain(t)); err == nil {
|
||||
domains[dn.SLD+"."+dn.TLD] = struct{}{}
|
||||
if dn.TRD != "" {
|
||||
fqdns[dn.String()] = struct{}{}
|
||||
@ -79,15 +81,17 @@ func isPotentialDomain(s string) bool {
|
||||
}
|
||||
|
||||
func extractDomain(str string) string {
|
||||
str = removeWildcards(str)
|
||||
u := str
|
||||
if !strings.Contains(str, "://") {
|
||||
u = "https://" + str
|
||||
}
|
||||
u = sanitizeURL(u)
|
||||
parsedURL, err := url.Parse(u)
|
||||
if err != nil {
|
||||
return str
|
||||
return ""
|
||||
}
|
||||
return parsedURL.Host
|
||||
return parsedURL.Hostname()
|
||||
}
|
||||
|
||||
func removeWildcards(domain string) string {
|
||||
@ -108,3 +112,12 @@ func removeWildcards(domain string) string {
|
||||
}
|
||||
return strings.Join(parts, ".")
|
||||
}
|
||||
|
||||
// urlInvalidCharRegex matches any single character that is not safe to keep
// verbatim in a scraped URL (ASCII word chars, '-', '.', '/', ':', '~').
var urlInvalidCharRegex = regexp.MustCompile(`[^\w-./:~]`)

// sanitizeURL percent-encodes every character of u that urlInvalidCharRegex
// rejects, so the result can be handed to url.Parse without errors.
func sanitizeURL(u string) string {
	// Replace invalid characters with percent-encoded equivalents
	return urlInvalidCharRegex.ReplaceAllStringFunc(u, func(match string) string {
		// Encode every byte of the match: a multi-byte (non-ASCII) rune is
		// matched as one multi-byte string, and encoding only match[0] would
		// silently drop its remaining bytes.
		var b strings.Builder
		for i := 0; i < len(match); i++ {
			fmt.Fprintf(&b, "%%%02X", match[i])
		}
		return b.String()
	})
}
|
||||
|
113
common/httpx/domains.go
Normal file
113
common/httpx/domains.go
Normal file
@ -0,0 +1,113 @@
|
||||
package httpx
|
||||
|
||||
import (
|
||||
"regexp"
|
||||
"strings"
|
||||
"unicode"
|
||||
|
||||
mapsutil "github.com/projectdiscovery/utils/maps"
|
||||
stringsutil "github.com/projectdiscovery/utils/strings"
|
||||
"github.com/weppos/publicsuffix-go/publicsuffix"
|
||||
)
|
||||
|
||||
const (
	// potentialDomainRegex matches a domain-like token. Capture group 1 is the
	// domain itself; the surrounding non-capturing groups require the token to
	// be delimited by a quote, slash, '@', or a line boundary, filtering out
	// matches embedded in irrelevant contexts (e.g. inside identifiers).
	potentialDomainRegex = `(?:^|['"/@])` + `([a-z0-9]+[a-z0-9.-]*\.[a-z]{2,})` + `(?:['"/@]|$)`
)
|
||||
|
||||
var (
	// potentialDomainsCompiled is a compiled regex for potential domains (aka domain names)
	potentialDomainsCompiled = regexp.MustCompile(potentialDomainRegex)
	// defaultDenylist holds file/asset extensions that disqualify a token from
	// being treated as a domain (e.g. "logo.png" would otherwise look like an
	// FQDN). NOTE(review): ".map" appears twice; harmless because the list is
	// folded into a set in init(), but one entry could be dropped.
	defaultDenylist = []string{".3g2", ".3gp", ".7z", ".apk", ".arj", ".avi", ".axd", ".bmp", ".csv", ".deb", ".dll", ".doc", ".drv", ".eot", ".exe", ".flv", ".gif", ".gifv", ".gz", ".h264", ".ico", ".iso", ".jar", ".jpeg", ".jpg", ".lock", ".m4a", ".m4v", ".map", ".mkv", ".mov", ".mp3", ".mp4", ".mpeg", ".mpg", ".msi", ".ogg", ".ogm", ".ogv", ".otf", ".pdf", ".pkg", ".png", ".ppt", ".psd", ".rar", ".rm", ".rpm", ".svg", ".swf", ".sys", ".tar.gz", ".tar", ".tif", ".tiff", ".ttf", ".txt", ".vob", ".wav", ".webm", ".webp", ".wmv", ".woff", ".woff2", ".xcf", ".xls", ".xlsx", ".zip", ".css", ".js", ".map", ".php", ".sheet", ".ms", ".wp", ".html", ".htm", ".md"}
	// suffixBlacklist is the set form of defaultDenylist, populated in init()
	// for O(1) lookups in isValidDomain.
	suffixBlacklist = map[string]struct{}{}
)
|
||||
|
||||
// BodyDomain holds hostnames extracted from an HTTP response body.
type BodyDomain struct {
	// Fqdns are fully-qualified hostnames (registrable domain plus subdomain).
	Fqdns []string `json:"body_fqdn,omitempty"`
	// Domains are registrable (eTLD+1) domains.
	Domains []string `json:"body_domains,omitempty"`
}
|
||||
|
||||
func (h *HTTPX) BodyDomainGrab(r *Response) *BodyDomain {
|
||||
domains := make(map[string]struct{})
|
||||
fqdns := make(map[string]struct{})
|
||||
|
||||
for _, tmp := range potentialDomainsCompiled.FindAllStringSubmatch(r.Raw, -1) {
|
||||
// only interested in 1st group
|
||||
if len(tmp) < 2 {
|
||||
continue
|
||||
}
|
||||
d := tmp[1]
|
||||
// minimal + known blacklist
|
||||
if !isValidDomain(d) {
|
||||
continue
|
||||
}
|
||||
// try to parse its tld
|
||||
if !isValidTLD(d) {
|
||||
continue
|
||||
}
|
||||
// get domain
|
||||
val, err := publicsuffix.Domain(d)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
if r.Input != val {
|
||||
domains[val] = struct{}{}
|
||||
}
|
||||
if d != val && d != r.Input {
|
||||
fqdns[d] = struct{}{}
|
||||
}
|
||||
}
|
||||
|
||||
return &BodyDomain{Domains: mapsutil.GetKeys(domains), Fqdns: mapsutil.GetKeys(fqdns)}
|
||||
}
|
||||
|
||||
func isValidDomain(d string) bool {
|
||||
parts := strings.Split(d, ".")
|
||||
if len(parts) < 2 {
|
||||
return false
|
||||
}
|
||||
// this is try when all parts are numeric
|
||||
// in which this is not a valid domain (could be a ip or something else)
|
||||
allnumeric := true
|
||||
// traverse in reverse
|
||||
for i := len(parts) - 1; i >= 0; i-- {
|
||||
if _, ok := suffixBlacklist["."+parts[i]]; ok {
|
||||
return false
|
||||
}
|
||||
// check for numeric
|
||||
local:
|
||||
for _, c := range parts[i] {
|
||||
if !unicode.IsDigit(c) {
|
||||
allnumeric = false
|
||||
break local
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if allnumeric {
|
||||
// not a domain could be ip or something else
|
||||
return false
|
||||
}
|
||||
|
||||
// simple hack for android/ios package name
|
||||
if stringsutil.HasPrefixAny(d, "com", "net", "io", "org") && !stringsutil.HasSuffixAny(d, "com", "net", "io", "org") {
|
||||
return false
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
func isValidTLD(domain string) bool {
|
||||
rule := publicsuffix.DefaultList.Find(domain, publicsuffix.DefaultFindOptions)
|
||||
if rule == nil || rule.Type != publicsuffix.NormalType {
|
||||
return false
|
||||
}
|
||||
|
||||
_, err := publicsuffix.ParseFromListWithOptions(publicsuffix.DefaultList, domain, &publicsuffix.FindOptions{DefaultRule: rule})
|
||||
return err == nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
for _, s := range defaultDenylist {
|
||||
suffixBlacklist[s] = struct{}{}
|
||||
}
|
||||
}
|
29
common/httpx/domains_test.go
Normal file
29
common/httpx/domains_test.go
Normal file
@ -0,0 +1,29 @@
|
||||
package httpx
|
||||
|
||||
import (
|
||||
_ "embed"
|
||||
"sort"
|
||||
"testing"
|
||||
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
//go:embed test-data/hackerone.html
|
||||
var rawResponse string
|
||||
|
||||
func TestBodyGrabDoamins(t *testing.T) {
|
||||
ht, err := New(&DefaultOptions)
|
||||
require.Nil(t, err)
|
||||
resposne := &Response{
|
||||
Raw: rawResponse,
|
||||
}
|
||||
bd := ht.BodyDomainGrab(resposne)
|
||||
|
||||
sort.Strings(bd.Domains)
|
||||
sort.Strings(bd.Fqdns)
|
||||
|
||||
t.Run("body domain grab", func(t *testing.T) {
|
||||
require.Equal(t, 24, len(bd.Domains))
|
||||
require.Equal(t, 16, len(bd.Fqdns))
|
||||
})
|
||||
}
|
@ -220,6 +220,7 @@ get_response:
|
||||
}
|
||||
|
||||
var resp Response
|
||||
resp.Input = req.Host
|
||||
|
||||
resp.Headers = httpresp.Header.Clone()
|
||||
|
||||
@ -313,6 +314,7 @@ get_response:
|
||||
|
||||
if h.Options.ExtractFqdn {
|
||||
resp.CSPData = h.CSPGrab(&resp)
|
||||
resp.BodyDomains = h.BodyDomainGrab(&resp)
|
||||
}
|
||||
|
||||
// build the redirect flow by reverse cycling the response<-request chain
|
||||
|
@ -25,6 +25,6 @@ func TestDo(t *testing.T) {
|
||||
require.Nil(t, err)
|
||||
resp, err := ht.Do(req, UnsafeOptions{})
|
||||
require.Nil(t, err)
|
||||
require.Equal(t, 318, resp.ContentLength)
|
||||
require.Greater(t, len(resp.Raw), 800)
|
||||
})
|
||||
}
|
||||
|
@ -10,6 +10,7 @@ import (
|
||||
|
||||
// Response contains the response to a server
|
||||
type Response struct {
|
||||
Input string // input that was given
|
||||
StatusCode int
|
||||
Headers map[string][]string
|
||||
RawData []byte // undecoded data
|
||||
@ -21,6 +22,7 @@ type Response struct {
|
||||
Lines int
|
||||
TLSData *clients.Response
|
||||
CSPData *CSPData
|
||||
BodyDomains *BodyDomain
|
||||
HTTP2 bool
|
||||
Pipeline bool
|
||||
Duration time.Duration
|
||||
|
9
common/httpx/test-data/hackerone.html
Normal file
9
common/httpx/test-data/hackerone.html
Normal file
File diff suppressed because one or more lines are too long
@ -2211,6 +2211,10 @@ retry:
|
||||
RequestRaw: requestDump,
|
||||
Response: resp,
|
||||
}
|
||||
if resp.BodyDomains != nil {
|
||||
result.Fqdns = resp.BodyDomains.Fqdns
|
||||
result.Domains = resp.BodyDomains.Domains
|
||||
}
|
||||
return result
|
||||
}
|
||||
|
||||
|
@ -89,6 +89,8 @@ type Result struct {
|
||||
ScreenshotPathRel string `json:"screenshot_path_rel,omitempty" csv:"screenshot_path_rel"`
|
||||
KnowledgeBase map[string]interface{} `json:"knowledgebase,omitempty" csv:"knowledgebase"`
|
||||
Resolvers []string `json:"resolvers,omitempty" csv:"resolvers"`
|
||||
Fqdns []string `json:"body_fqdn,omitempty"`
|
||||
Domains []string `json:"body_domains,omitempty"`
|
||||
|
||||
// Internal Fields
|
||||
TechnologyDetails map[string]wappalyzer.AppInfo `json:"-" csv:"-"`
|
||||
|
Loading…
Reference in New Issue
Block a user