2021-11-29 15:58:18 +03:00
|
|
|
#!/usr/bin/env python3
|
2021-09-23 17:26:04 +03:00
|
|
|
# Extract hurl file from html output
|
|
|
|
import sys
|
|
|
|
from bs4 import BeautifulSoup
|
|
|
|
import os
|
|
|
|
import codecs
|
|
|
|
|
2022-02-05 08:56:33 +03:00
|
|
|
|
2021-09-23 17:26:04 +03:00
|
|
|
def test(html_file):
    """Check that the text extracted from *html_file* matches its hurl file.

    The expected content is read from the sibling file that has the same
    path as *html_file* with the extension replaced by ``.hurl``. When that
    file does not exist the check is skipped silently.

    Args:
        html_file: Path of the HTML file to verify.

    On mismatch, prints both contents and terminates the process with exit
    status 1 via ``sys.exit``.
    """
    print(html_file)
    actual = extract_hurl_content(html_file)

    hurl_file = os.path.splitext(html_file)[0] + ".hurl"
    if not os.path.isfile(hurl_file):
        return

    # "utf-8-sig" drops a leading BOM: the input file can be saved with one.
    # The context manager guarantees the handle is closed once it is read.
    with codecs.open(hurl_file, encoding="utf-8-sig") as expected_stream:
        expected = expected_stream.read()

    if actual != expected:
        print(">>> error in html file")
        print(f"actual: <{actual}>\nexpected: <{expected}>")
        sys.exit(1)
|
|
|
|
|
|
|
|
|
|
|
|
def extract_hurl_content(hurl_file):
    """Return the text content of an HTML file, with all markup removed.

    Args:
        hurl_file: Path of the file to read. Despite the parameter name,
            the caller in this script passes the path of an HTML file.

    Returns:
        The ``.text`` of the document as parsed by BeautifulSoup with the
        ``lxml`` parser.
    """
    # Decode explicitly as UTF-8 instead of the platform default so the
    # result does not depend on the locale, and close the handle promptly.
    with open(hurl_file, encoding="utf-8") as html_stream:
        html = html_stream.read()
    return BeautifulSoup(html, "lxml").text
|
|
|
|
|
|
|
|
|
|
|
|
def main():
    """Run the HTML output check on every path given on the command line."""
    print("** test html output")
    # argv[0] is the script itself; everything after it is a file to check.
    html_files = sys.argv[1:]
    for path in html_files:
        test(path)
|
|
|
|
|
|
|
|
|
2022-02-05 08:56:33 +03:00
|
|
|
# Run the checks only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|