Replace lxml.HTMLParser with BeautifulSoup parser.

lxml.HTMLParser has trouble with emoji content (see https://github.com/Orange-OpenSource/hurl/issues/959).
Besides, BeautifulSoup is already a dependency of our test suite.
This commit is contained in:
jcamiel 2022-11-05 15:42:10 +01:00
parent b38d108eb2
commit 1c796d8f0c
No known key found for this signature in database
GPG Key ID: 07FF11CFD55356CC
5 changed files with 47 additions and 28 deletions

View File

@ -2,15 +2,15 @@
# Check that issues in CHANGELOG are up-to-date
set -eu
#version=$(head -1 <CHANGELOG.md| cut -d" " -f1 | cut -d'[' -f2)
#changelog=$(bin/release/changelog_extract.py "$version" <CHANGELOG.md| grep '^ \* ')
#issues=$(bin/release/get_release_note.py "$version" 2>/dev/null | grep '^ \* ')
#
#if [ "$changelog" != "$issues" ]; then
# echo "Diff in issues in CHANGELOG"
# diff <(echo "$changelog") <(echo "$issues")
# exit 1
#fi
version=$(head -1 <CHANGELOG.md| cut -d" " -f1 | cut -d'[' -f2)
changelog=$(bin/release/changelog_extract.py "$version" <CHANGELOG.md| grep '^\* ')
issues=$(bin/release/get_release_note.py "$version" 2>/dev/null | grep '^\* ')
if [ "$changelog" != "$issues" ]; then
echo "Diff in issues in CHANGELOG"
diff <(echo "$changelog") <(echo "$issues")
exit 1
fi

View File

@ -8,11 +8,10 @@ Example:
import datetime
import json
import sys
from io import StringIO
from typing import List
import requests
from lxml import etree
from bs4 import BeautifulSoup
hurl_repo_url = "https://github.com/Orange-OpenSource/hurl"
@ -135,15 +134,14 @@ def get_linked_pulls(issue_number) -> List[Pull]:
def webscrapping_linked_pulls(html) -> List[Pull]:
parser = etree.HTMLParser()
tree = etree.parse(StringIO(html), parser)
links = tree.xpath("//development-menu//a")
soup = BeautifulSoup(html, "html.parser")
links = soup.select("development-menu a")
pulls = []
for link in links:
url = link.attrib["href"]
url = link["href"]
if url == "/Orange-OpenSource/hurl":
continue
description = "".join(link.itertext()).strip()
description = "".join(link.getText()).strip()
pull = Pull(url, description)
pulls.append(pull)
return pulls

View File

@ -153,6 +153,22 @@ ISSUE_HTML = """<development-menu data-catalyst="">
"""
ISSUE_WITH_EMOJI_HTML = """
<html>
<head>
<meta charset="UTF-8">
<meta name="description" content="👋 hello">
<title>Issue</title>
</head>
<body>
<development-menu>
<a href="/Orange-OpenSource/hurl/pull/958">Issue 958</a>
</development-menu>
</body>
</html>
"""
class GetReleaseNoteTest(unittest.TestCase):
def test_authors_from_issues(self):
self.assertEqual(["bob", "bill"], authors_from_issues(ISSUES))
@ -166,6 +182,12 @@ class GetReleaseNoteTest(unittest.TestCase):
webscrapping_linked_pulls(ISSUE_HTML),
)
def test_webscrapping_issue_with_emoji(self):
self.assertEqual(
[Pull("/Orange-OpenSource/hurl/pull/958", "Issue 958", [], [])],
webscrapping_linked_pulls(ISSUE_WITH_EMOJI_HTML),
)
def test_generate_md(self):
self.assertEqual(
"""[1.0.0 (2022-01-01)](https://github.com/Orange-OpenSource/hurl/blob/master/CHANGELOG.md#1.0.0)
@ -178,16 +200,16 @@ Thanks to
Enhancements:
* pull1 [#1](https://github.com/Orange-OpenSource/hurl/issues/1)
* pull1 [#1](https://github.com/Orange-OpenSource/hurl/issues/1)
* pull4 [#3](https://github.com/Orange-OpenSource/hurl/issues/3) [#4](https://github.com/Orange-OpenSource/hurl/issues/4)
* pull4 [#3](https://github.com/Orange-OpenSource/hurl/issues/3) [#4](https://github.com/Orange-OpenSource/hurl/issues/4)
Bugs Fixed:
* pull2 [#2](https://github.com/Orange-OpenSource/hurl/issues/2)
* pull2 [#2](https://github.com/Orange-OpenSource/hurl/issues/2)
* pull3 [#2](https://github.com/Orange-OpenSource/hurl/issues/2)
* pull3 [#2](https://github.com/Orange-OpenSource/hurl/issues/2)
""",
generate_md(
milestone="1.0.0",

View File

@ -3,8 +3,7 @@ beautifulsoup4==4.11.1
black==22.6.0
blinker==1.4
Brotli==1.0.9
bs4==0.0.1
certifi==2022.6.15
certifi==2022.9.24
cffi==1.15.1
charset-normalizer==2.1.1
click==8.1.3
@ -14,7 +13,7 @@ h11==0.13.0
h2==4.1.0
hpack==4.0.0
hyperframe==6.0.1
idna==3.3
idna==3.4
itsdangerous==2.1.2
Jinja2==3.1.2
kaitaistruct==0.9
@ -26,9 +25,9 @@ msgpack==1.0.4
mypy==0.971
mypy-extensions==0.4.3
passlib==1.7.4
pathspec==0.9.0
pathspec==0.10.1
platformdirs==2.5.2
protobuf==3.19.5
protobuf==3.19.6
publicsuffix2==2.20191221
pyasn1==0.4.8
pycparser==2.21
@ -37,12 +36,12 @@ pyparsing==3.0.9
pyperclip==1.8.2
requests==2.28.1
ruamel.yaml==0.17.21
ruamel.yaml.clib==0.2.6
ruamel.yaml.clib==0.2.7
sortedcontainers==2.4.0
soupsieve==2.3.2.post1
tomli==2.0.1
tornado==6.2
typing_extensions==4.3.0
typing_extensions==4.4.0
urllib3==1.26.12
urwid==2.1.2
Werkzeug==2.0.3

View File

@ -1,5 +1,5 @@
black==22.6.0
bs4==0.0.1
beautifulsoup4==4.11.1
Flask==2.0.3
lxml==4.9.1
mitmproxy==8.0.0