Adding support for euc-kr charset

This commit is contained in:
mzack 2022-02-02 07:18:57 +01:00
parent 24c20f2f45
commit eb3787518f
2 changed files with 18 additions and 6 deletions

View File

@@ -4,6 +4,7 @@ import (
"bytes" "bytes"
"io/ioutil" "io/ioutil"
"golang.org/x/text/encoding/korean"
"golang.org/x/text/encoding/simplifiedchinese" "golang.org/x/text/encoding/simplifiedchinese"
"golang.org/x/text/encoding/traditionalchinese" "golang.org/x/text/encoding/traditionalchinese"
"golang.org/x/text/transform" "golang.org/x/text/transform"
@@ -43,3 +44,8 @@ func Encodebig5(s []byte) ([]byte, error) {
} }
return d, nil return d, nil
} }
// DecodeKorean converts EUC-KR encoded bytes to UTF-8.
// Any error reported by the decoder is returned to the caller unchanged.
func DecodeKorean(s []byte) ([]byte, error) {
	return korean.EUCKR.NewDecoder().Bytes(s)
}

View File

@@ -7,6 +7,7 @@ import (
"regexp" "regexp"
"strings" "strings"
"github.com/projectdiscovery/stringsutil"
"golang.org/x/net/html" "golang.org/x/net/html"
) )
@@ -39,16 +40,21 @@ func ExtractTitle(r *Response) (title string) {
// Non UTF-8 // Non UTF-8
if contentTypes, ok := r.Headers["Content-Type"]; ok { if contentTypes, ok := r.Headers["Content-Type"]; ok {
contentType := strings.Join(contentTypes, ";") contentType := strings.ToLower(strings.Join(contentTypes, ";"))
// special cases switch {
if strings.Contains(strings.ToLower(contentType), "charset=gb2312") || case stringsutil.ContainsAny(contentType, "charset=gb2312", "charset=gbk"):
strings.Contains(strings.ToLower(contentType), "charset=gbk") {
titleUtf8, err := Decodegbk([]byte(title)) titleUtf8, err := Decodegbk([]byte(title))
if err != nil { if err != nil {
return return
} }
return string(titleUtf8)
case stringsutil.ContainsAny(contentType, "euc-kr"):
titleUtf8, err := DecodeKorean([]byte(title))
if err != nil {
return
}
return string(titleUtf8) return string(titleUtf8)
} }
@@ -63,12 +69,12 @@ func ExtractTitle(r *Response) (title string) {
} }
mcontentType = strings.ToLower(mcontentType) mcontentType = strings.ToLower(mcontentType)
} }
if strings.Contains(mcontentType, "gb2312") || strings.Contains(mcontentType, "gbk") { switch {
case stringsutil.ContainsAny(mcontentType, "gb2312", "gbk"):
titleUtf8, err := Decodegbk([]byte(title)) titleUtf8, err := Decodegbk([]byte(title))
if err != nil { if err != nil {
return return
} }
return string(titleUtf8) return string(titleUtf8)
} }
} }