mirror of
https://github.com/serokell/xrefcheck.git
synced 2024-10-26 15:35:29 +03:00
[#95] Support HTML tag parsing compatible with HTML spec
Problem: We had a hardcoded HTML tag parser that doesn't work with all valid HTML tags. Solution: Replace it with the `tagsoup` library, which takes care of all the parsing.
This commit is contained in:
parent
e4af3c7310
commit
a6b4513587
@ -101,6 +101,7 @@ library:
|
|||||||
- regex-tdfa
|
- regex-tdfa
|
||||||
- req
|
- req
|
||||||
- roman-numerals
|
- roman-numerals
|
||||||
|
- tagsoup
|
||||||
- text
|
- text
|
||||||
- text-metrics
|
- text-metrics
|
||||||
- th-lift-instances
|
- th-lift-instances
|
||||||
|
@ -26,6 +26,7 @@ import Data.Default (def)
|
|||||||
import Data.Text qualified as T
|
import Data.Text qualified as T
|
||||||
import Data.Text.Lazy qualified as LT
|
import Data.Text.Lazy qualified as LT
|
||||||
import Fmt (Buildable (..), blockListF, nameF, (+|), (|+))
|
import Fmt (Buildable (..), blockListF, nameF, (+|), (|+))
|
||||||
|
import Text.HTML.TagSoup
|
||||||
|
|
||||||
import Xrefcheck.Core
|
import Xrefcheck.Core
|
||||||
import Xrefcheck.Scan
|
import Xrefcheck.Scan
|
||||||
@ -146,7 +147,16 @@ nodeExtractInfo input@(Node _ _ nSubs) = do
|
|||||||
return $ FileInfoDiff DList.empty $ DList.singleton $ Anchor {aType, aName, aPos}
|
return $ FileInfoDiff DList.empty $ DList.singleton $ Anchor {aType, aName, aPos}
|
||||||
|
|
||||||
HTML_INLINE text -> do
|
HTML_INLINE text -> do
|
||||||
let mName = T.stripSuffix "\">" =<< T.stripPrefix "<a name=\"" text
|
let
|
||||||
|
mName = do
|
||||||
|
tag <- safeHead $ parseTags text
|
||||||
|
attributes <- case tag of
|
||||||
|
TagOpen a attrs
|
||||||
|
| T.toLower a == "a" -> Just attrs
|
||||||
|
_ -> Nothing
|
||||||
|
(_, name) <- find (\(field, _) -> T.toLower field == "name") attributes
|
||||||
|
pure name
|
||||||
|
|
||||||
case mName of
|
case mName of
|
||||||
Just aName -> do
|
Just aName -> do
|
||||||
let aType = HandAnchor
|
let aType = HandAnchor
|
||||||
|
13
tests/golden/check-html/check-html.md
Normal file
13
tests/golden/check-html/check-html.md
Normal file
@ -0,0 +1,13 @@
|
|||||||
|
<!--
|
||||||
|
- SPDX-FileCopyrightText: 2022 Serokell <https://serokell.io>
|
||||||
|
-
|
||||||
|
- SPDX-License-Identifier: MPL-2.0
|
||||||
|
-->
|
||||||
|
|
||||||
|
## <a name='one'> <a name=two> <a NAME="three"> <a name="four"></a> <a NAME=five > Title1
|
||||||
|
|
||||||
|
[One](#one)
|
||||||
|
[Two](#two)
|
||||||
|
[Three](#three)
|
||||||
|
[Four](#four)
|
||||||
|
[Five](#five)
|
16
tests/golden/check-html/check.html.bats
Normal file
16
tests/golden/check-html/check.html.bats
Normal file
@ -0,0 +1,16 @@
|
|||||||
|
#!/usr/bin/env bats
|
||||||
|
|
||||||
|
# SPDX-FileCopyrightText: 2022 Serokell <https://serokell.io>
|
||||||
|
#
|
||||||
|
# SPDX-License-Identifier: MPL-2.0
|
||||||
|
|
||||||
|
load '../helpers/bats-support/load'
|
||||||
|
load '../helpers/bats-assert/load'
|
||||||
|
load '../helpers'
|
||||||
|
|
||||||
|
|
||||||
|
@test "All HTML anchors should be valid" {
|
||||||
|
run xrefcheck
|
||||||
|
|
||||||
|
assert_output --partial "All repository links are valid."
|
||||||
|
}
|
Loading…
Reference in New Issue
Block a user