Improved title extraction

2024-12-01 12:13:00 +03:00 · 2020-12-21 00:44:16 +01:00 · 2020-12-21 00:44:16 +01:00 · 9ddeea3895
commit 9ddeea3895
parent 1dbf99c88e
1 changed files with 51 additions and 6 deletions
--- a/common/httpx/title.go
+++ b/common/httpx/title.go
@ -1,20 +1,36 @@
 package httpx

 import (
+	"bytes"
+	"fmt"
+	"io"
 	"regexp"
 	"strings"

 	"golang.org/x/net/html"
 )

+var (
+	reTitle       *regexp.Regexp = regexp.MustCompile(`(?im)<\s*title.*>(.*?)<\s*/\s*title>`)
+	reContentType *regexp.Regexp = regexp.MustCompile(`(?im)\s*charset="(.*?)"|charset=(.*?)"\s*`)
+)
+
 // ExtractTitle from a response
 func ExtractTitle(r *Response) (title string) {
-	var re = regexp.MustCompile(`(?im)<\s*title.*>(.*?)<\s*/\s*title>`)
-	for _, match := range re.FindAllString(r.Raw, -1) {
-		title = html.UnescapeString(trimTitleTags(match))
-		break
+	// Try to parse the DOM
+	titleDom, err := getTitleWithDom(r)
+	// In case of error fallback to regex
+	if err != nil {
+		for _, match := range reTitle.FindAllString(r.Raw, -1) {
+			title = match
+			break
+		}
+	} else {
+		title = renderNode(titleDom)
 	}

+	title = html.UnescapeString(trimTitleTags(title))
+
 	// Non UTF-8
 	if contentTypes, ok := r.Headers["Content-Type"]; ok {
 		contentType := strings.Join(contentTypes, ";")
@ -31,8 +47,7 @@ func ExtractTitle(r *Response) (title string) {
 		}

 		// Content-Type from head tag
-		re = regexp.MustCompile(`(?im)\s*charset="(.*?)"|charset=(.*?)"\s*`)
-		var match = re.FindSubmatch(r.Data)
+		var match = reContentType.FindSubmatch(r.Data)
 		var mcontentType = ""
 		if len(match) != 0 {
 			for i, v := range match {
@ -55,6 +70,36 @@ func ExtractTitle(r *Response) (title string) {
 	return //nolint
 }

+func getTitleWithDom(r *Response) (*html.Node, error) {
+	var title *html.Node
+	var crawler func(*html.Node)
+	crawler = func(node *html.Node) {
+		if node.Type == html.ElementNode && node.Data == "title" {
+			title = node
+			return
+		}
+		for child := node.FirstChild; child != nil; child = child.NextSibling {
+			crawler(child)
+		}
+	}
+	htmlDoc, err := html.Parse(bytes.NewReader(r.Data))
+	if err != nil {
+		return nil, err
+	}
+	crawler(htmlDoc)
+	if title != nil {
+		return title, nil
+	}
+	return nil, fmt.Errorf("Title not found")
+}
+
+func renderNode(n *html.Node) string {
+	var buf bytes.Buffer
+	w := io.Writer(&buf)
+	html.Render(w, n)
+	return buf.String()
+}
+
 func trimTitleTags(title string) string {
 	// trim <title>*</title>
 	titleBegin := strings.Index(title, ">")