#!/usr/bin/env python3
# Extract hurl file from html output
import codecs
import os
import sys

from bs4 import BeautifulSoup


def test(html_file):
    """Check that the text of *html_file* matches its sibling ``.hurl`` file.

    The path is printed first, then the text content of the HTML file is
    extracted. The expected content is read from the file that has the same
    path with the extension replaced by ``.hurl``. If that file does not
    exist, the function returns without comparing anything.

    Both sides are compared after stripping leading/trailing whitespace.
    On mismatch, an error and both contents are printed and the process
    exits with status 1.
    """
    print(html_file)
    actual = extract_hurl_content(html_file)

    hurl_file = os.path.splitext(html_file)[0] + ".hurl"
    if not os.path.isfile(hurl_file):
        return

    # Input file can be saved with a BOM, hence "utf-8-sig".
    # codecs.open is kept on purpose: it reads the file in binary mode and
    # therefore does not translate line endings in the expected content.
    with codecs.open(hurl_file, encoding="utf-8-sig") as f:
        expected = f.read()

    if actual.strip() != expected.strip():
        print(">>> error in html file")
        print(f"actual: <{actual}>\nexpected: <{expected}>")
        sys.exit(1)


def extract_hurl_content(hurl_file):
    """Return the text content of an HTML file.

    Despite its name, *hurl_file* is the path of the HTML file to parse
    (``test`` passes the HTML path here). The file is decoded as UTF-8
    explicitly so the result does not depend on the platform's default
    encoding, then parsed with BeautifulSoup's ``lxml`` parser.
    """
    with open(hurl_file, encoding="utf-8") as f:
        s = f.read()
    return BeautifulSoup(s, "lxml").text


def main():
    """Run ``test`` on every file path given on the command line."""
    print("** test html output")
    for html_file in sys.argv[1:]:
        test(html_file)


if __name__ == "__main__":
    main()