[#95] Support HTML tag parsing compatible with HTML spec

Problem: We had a hardcoded HTML tag parser that doesn't work with all valid HTML tags

Solution: Replace it with the `tagsoup` library, which takes care of all the parsing
This commit is contained in:
Andrei Borzenkov 2022-07-13 15:26:37 +04:00
parent e4af3c7310
commit a6b4513587
4 changed files with 41 additions and 1 deletions

View File

@ -101,6 +101,7 @@ library:
- regex-tdfa
- req
- roman-numerals
- tagsoup
- text
- text-metrics
- th-lift-instances

View File

@ -26,6 +26,7 @@ import Data.Default (def)
import Data.Text qualified as T
import Data.Text.Lazy qualified as LT
import Fmt (Buildable (..), blockListF, nameF, (+|), (|+))
import Text.HTML.TagSoup
import Xrefcheck.Core
import Xrefcheck.Scan
@ -146,7 +147,16 @@ nodeExtractInfo input@(Node _ _ nSubs) = do
return $ FileInfoDiff DList.empty $ DList.singleton $ Anchor {aType, aName, aPos}
HTML_INLINE text -> do
let mName = T.stripSuffix "\">" =<< T.stripPrefix "<a name=\"" text
let
mName = do
tag <- safeHead $ parseTags text
attributes <- case tag of
TagOpen a attrs
| T.toLower a == "a" -> Just attrs
_ -> Nothing
(_, name) <- find (\(field, _) -> T.toLower field == "name") attributes
pure name
case mName of
Just aName -> do
let aType = HandAnchor

View File

@ -0,0 +1,13 @@
<!--
- SPDX-FileCopyrightText: 2022 Serokell <https://serokell.io>
-
- SPDX-License-Identifier: MPL-2.0
-->
## <a name='one'> <a name=two> <a NAME="three"> <a name="four"></a> <a NAME=five > Title1
[One](#one)
[Two](#two)
[Three](#three)
[Four](#four)
[Five](#five)

View File

@ -0,0 +1,16 @@
#!/usr/bin/env bats
# SPDX-FileCopyrightText: 2022 Serokell <https://serokell.io>
#
# SPDX-License-Identifier: MPL-2.0
load '../helpers/bats-support/load'
load '../helpers/bats-assert/load'
load '../helpers'
# Runs xrefcheck with no arguments and checks that its captured output
# contains the "all links valid" message.
# NOTE(review): presumably executed from the directory holding the Markdown
# fixture with the HTML anchors — confirm against the test harness.
@test "All HTML anchors should be valid" {
# `run` captures the command's output for the assertion below.
run xrefcheck
# Substring match only; the exit status is not asserted in this test.
assert_output --partial "All repository links are valid."
}