2012-03-22 02:19:27 +04:00
|
|
|
|
"""
|
|
|
|
|
weasyprint.html
|
|
|
|
|
---------------
|
2011-05-25 17:54:46 +04:00
|
|
|
|
|
2012-03-22 02:19:27 +04:00
|
|
|
|
Specific handling for some HTML elements, especially replaced elements.
|
2011-05-25 17:54:46 +04:00
|
|
|
|
|
2012-03-22 02:19:27 +04:00
|
|
|
|
Replaced elements (eg. <img> elements) are rendered externally and
|
|
|
|
|
behave as an atomic opaque box in CSS. In general, they may or may not
|
|
|
|
|
have intrinsic dimensions. But the only replaced elements currently
|
|
|
|
|
supported in WeasyPrint are images with intrinsic dimensions.
|
2011-08-19 18:52:46 +04:00
|
|
|
|
|
2019-03-04 13:04:06 +03:00
|
|
|
|
:copyright: Copyright 2011-2019 Simon Sapin and contributors, see AUTHORS.
|
2012-03-22 02:19:27 +04:00
|
|
|
|
:license: BSD, see LICENSE for details.
|
2011-08-19 18:52:46 +04:00
|
|
|
|
|
2011-05-25 17:54:46 +04:00
|
|
|
|
"""
|
|
|
|
|
|
2012-03-25 03:39:41 +04:00
|
|
|
|
import logging
|
2013-07-14 15:08:02 +04:00
|
|
|
|
import re
|
2018-01-14 03:48:17 +03:00
|
|
|
|
from urllib.parse import urljoin
|
2011-08-10 16:51:18 +04:00
|
|
|
|
|
2018-08-23 19:20:36 +03:00
|
|
|
|
from . import CSS, ROOT
|
2013-07-14 15:08:02 +04:00
|
|
|
|
from .css import get_child_text
|
2011-10-14 21:07:13 +04:00
|
|
|
|
from .formatting_structure import boxes
|
2012-03-25 03:39:41 +04:00
|
|
|
|
from .logger import LOGGER
|
2017-03-25 02:33:36 +03:00
|
|
|
|
from .urls import get_url_attribute
|
2012-03-25 03:39:41 +04:00
|
|
|
|
|
|
|
|
|
# Build the user-agent stylesheets once, at import time.
# XXX temporarily disable logging for user-agent stylesheet
level = LOGGER.level
LOGGER.setLevel(logging.ERROR)
try:
    HTML5_UA_STYLESHEET = CSS(filename=(ROOT / 'css' / 'html5_ua.css'))
    HTML5_PH_STYLESHEET = CSS(filename=(ROOT / 'css' / 'html5_ph.css'))
finally:
    # Restore the previous level even when a stylesheet fails to load, so a
    # failed import does not leave the logger stuck at ERROR.
    LOGGER.setLevel(level)
|
2011-08-10 16:51:18 +04:00
|
|
|
|
|
2011-05-25 17:54:46 +04:00
|
|
|
|
|
2014-04-22 04:34:47 +04:00
|
|
|
|
# The HTML "space characters": space, tab, line feed, form feed and
# carriage return.
# http://whatwg.org/C#space-character
HTML_WHITESPACE = ' \t\n\f\r'
# Matches one maximal run of characters that are not HTML space characters,
# i.e. one token of a space-separated attribute value.
HTML_SPACE_SEPARATED_TOKENS_RE = re.compile('[^%s]+' % HTML_WHITESPACE)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def ascii_lower(string):
    r"""Lower-case the ASCII letters of a string and nothing else.

    Only A-Z is mapped to a-z; every other character is returned untouched.

    :param string: A Unicode string.
    :returns: A new Unicode string.

    This implements `ASCII case-insensitive
    <http://whatwg.org/C#ascii-case-insensitive>`_ matching.

    The :meth:`~py:str.lower` method of Unicode strings is not suitable
    because it also changes non-ASCII characters, and can even map them
    into the ASCII range:

        >>> keyword = u'Bac\N{KELVIN SIGN}ground'
        >>> assert keyword.lower() == u'background'
        >>> assert ascii_lower(keyword) != keyword.lower()
        >>> assert ascii_lower(keyword) == u'bac\N{KELVIN SIGN}ground'

    """
    # Going through bytes restricts lower-casing to ASCII letters.
    # This turns out to be faster than unicode.translate()
    encoded = string.encode('utf8')
    lowered = encoded.lower()
    return lowered.decode('utf8')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def element_has_link_type(element, link_type):
    """Tell whether ``element`` lists ``link_type`` in its ``rel`` attribute.

    The ``rel`` value is split into tokens on HTML whitespace and each token
    is compared ASCII case-insensitively. A missing ``rel`` attribute is
    treated as an empty one.

    :param element: An object with a dict-like ``get`` method for attributes.
    :param link_type: Must be a lower-case string.
    :returns: ``True`` if one of the tokens matches, ``False`` otherwise.

    """
    rel_value = element.get('rel', '')
    for token in HTML_SPACE_SEPARATED_TOKENS_RE.findall(rel_value):
        if ascii_lower(token) == link_type:
            return True
    return False
|
|
|
|
|
|
|
|
|
|
|
2011-08-25 14:48:00 +04:00
|
|
|
|
# Maps HTML tag names to handler functions. Handlers are registered with the
# ``handler`` decorator and are called by ``handle_element`` as
# ``function(element, box, get_image_from_uri, base_url)``; they return a
# (possibly empty) list of boxes.
HTML_HANDLERS = {}
|
2011-08-20 20:02:04 +04:00
|
|
|
|
|
2011-08-25 19:29:16 +04:00
|
|
|
|
|
2017-07-03 16:19:05 +03:00
|
|
|
|
def handle_element(element, box, get_image_from_uri, base_url):
    """Dispatch an element to its registered handler, if it has one.

    A handler is used when ``box.element_tag`` is a key of ``HTML_HANDLERS``;
    the handler itself is looked up with ``element.tag``. Elements without
    a handler keep their box unchanged.

    :returns: a (possibly empty) list of boxes.

    """
    if box.element_tag not in HTML_HANDLERS:
        # Nothing special to do for this element.
        return [box]
    handle = HTML_HANDLERS[element.tag]
    return handle(element, box, get_image_from_uri, base_url)
|
2011-08-20 20:02:04 +04:00
|
|
|
|
|
|
|
|
|
|
2011-08-25 14:48:00 +04:00
|
|
|
|
def handler(tag):
    """Build a decorator that registers a function for ``tag`` elements.

    The decorated function is stored in ``HTML_HANDLERS`` under ``tag``,
    replacing any previous entry, and is returned unchanged so decorators
    can be stacked to register one function for several tags.

    """
    def register(function):
        """Store ``function`` as the handler for ``tag`` and return it."""
        HTML_HANDLERS[tag] = function
        return function

    return register
|
2011-08-20 20:02:04 +04:00
|
|
|
|
|
|
|
|
|
|
2011-12-08 21:11:32 +04:00
|
|
|
|
def make_replaced_box(element, box, image):
    """Build the replaced box that displays ``image`` for ``element``.

    The new box is a ``BlockReplacedBox`` when the ``display`` value of
    ``box.style`` is ``block``, ``list-item`` or ``table``, and an
    ``InlineReplacedBox`` for every other value. It reuses the style of
    ``box`` and takes over its ``string_set`` and ``bookmark_label``.

    :returns: the new replaced box.

    """
    is_block_level = box.style['display'] in ('block', 'list-item', 'table')
    # TODO: support images with 'display: table-cell'?
    box_class = (
        boxes.BlockReplacedBox if is_block_level
        else boxes.InlineReplacedBox)
    replaced = box_class(element.tag, box.style, element, image)
    # TODO: check other attributes that need to be copied
    # TODO: find another solution
    replaced.string_set = box.string_set
    replaced.bookmark_label = box.bookmark_label
    return replaced
|
2011-08-25 14:48:00 +04:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@handler('img')
def handle_img(element, box, get_image_from_uri, base_url):
    """Handle ``<img>`` elements, return either an image or the alt-text.

    When ``src`` resolves to a URL and ``get_image_from_uri`` returns an
    image, the result is a single replaced box. Otherwise a non-empty
    ``alt`` attribute becomes the text content of ``box``, and an empty or
    missing ``alt`` makes the element produce no box at all.

    See: http://www.w3.org/TR/html5/embedded-content-1.html#the-img-element

    :returns: a (possibly empty) list of boxes.

    """
    src = get_url_attribute(element, 'src', base_url)
    alt = element.get('alt')
    if src:
        image = get_image_from_uri(src)
        if image is not None:
            return [make_replaced_box(element, box, image)]
        # Invalid image, use the alt-text.
        if not alt:
            # An empty alt means the element represents nothing.
            # TODO: find some indicator that an image is missing.
            # For now, just remove the image.
            assert alt == '' or alt is None
            return []
    elif not alt:
        return []
    box.children = [boxes.TextBox.anonymous_from(box, alt)]
    return [box]
|
2011-05-25 17:54:46 +04:00
|
|
|
|
|
|
|
|
|
|
2012-02-29 20:38:30 +04:00
|
|
|
|
@handler('embed')
def handle_embed(element, box, get_image_from_uri, base_url):
    """Handle ``<embed>`` elements, return either an image or nothing.

    The image is fetched from the ``src`` URL, passing the stripped ``type``
    attribute along to ``get_image_from_uri``.

    See: https://www.w3.org/TR/html5/embedded-content-0.html#the-embed-element

    :returns: a list with one replaced box, or an empty list.

    """
    src = get_url_attribute(element, 'src', base_url)
    type_ = element.get('type', '').strip()
    image = get_image_from_uri(src, type_) if src else None
    if image is None:
        # No fallback.
        return []
    return [make_replaced_box(element, box, image)]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@handler('object')
def handle_object(element, box, get_image_from_uri, base_url):
    """Handle ``<object>`` elements, return either an image or the fallback
    content.

    The image is fetched from the ``data`` URL, passing the stripped ``type``
    attribute along to ``get_image_from_uri``.

    See: https://www.w3.org/TR/html5/embedded-content-0.html#the-object-element

    :returns: a list with one box: the replaced box or ``box`` itself.

    """
    data = get_url_attribute(element, 'data', base_url)
    type_ = element.get('type', '').strip()
    image = get_image_from_uri(data, type_) if data else None
    if image is None:
        # The element’s children are the fallback.
        return [box]
    return [make_replaced_box(element, box, image)]
|
|
|
|
|
|
|
|
|
|
|
2011-12-02 15:36:20 +04:00
|
|
|
|
def integer_attribute(element, box, name, minimum=1):
    """Copy an integer attribute of the HTML element onto the box.

    The attribute is read with ``element.get``, stripped and parsed with
    ``int``. The box attribute of the same name is only set when the value
    parses and is at least ``minimum``. In every other case (missing, empty,
    not an integer, too small) the box is left untouched.

    :param element: The element to read the ``name`` attribute from.
    :param box: The object on which the ``name`` attribute is set.
    :param name: Name of both the HTML attribute and the box attribute.
    :param minimum: Smallest accepted value.

    """
    raw = element.get(name, '').strip()
    if not raw:
        return
    try:
        number = int(raw)
    except ValueError:
        # Not an integer: keep whatever the box already has.
        return
    if number >= minimum:
        setattr(box, name, number)
|
2011-11-15 14:44:28 +04:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@handler('colgroup')
def handle_colgroup(element, box, _get_image_from_uri, _base_url):
    """Handle the ``span`` attribute of ``<colgroup>`` elements.

    Only ``TableColumnGroupBox`` boxes are touched. With a ``<col>`` child,
    ``box.span`` is reset to ``None``. Without one, the ``span`` attribute is
    read and the box gets ``box.span`` anonymous column boxes as children.

    :returns: a list containing ``box``.

    """
    if not isinstance(box, boxes.TableColumnGroupBox):
        return [box]
    has_col_child = any(child.tag == 'col' for child in element)
    if has_col_child:
        box.span = None  # sum of the children’s spans
        return [box]
    integer_attribute(element, box, 'span')
    box.children = (
        boxes.TableColumnBox.anonymous_from(box, [])
        for _i in range(box.span))
    return [box]
|
2011-11-15 14:44:28 +04:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@handler('col')
def handle_col(element, box, _get_image_from_uri, _base_url):
    """Handle the ``span`` attribute of ``<col>`` elements.

    Only ``TableColumnBox`` boxes are touched. When the resulting
    ``box.span`` is greater than 1, the result is ``box.span`` copies of
    ``box`` instead of ``box`` itself.

    :returns: a list of boxes.

    """
    if not isinstance(box, boxes.TableColumnBox):
        return [box]
    integer_attribute(element, box, 'span')
    span = box.span
    if span > 1:
        # Generate multiple boxes
        # http://lists.w3.org/Archives/Public/www-style/2011Nov/0293.html
        return [box.copy() for _i in range(span)]
    return [box]
|
2011-11-15 14:44:28 +04:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@handler('th')
@handler('td')
def handle_td(element, box, _get_image_from_uri, _base_url):
    """Handle the ``colspan`` and ``rowspan`` attributes of table cells.

    Only ``TableCellBox`` boxes are touched. ``colspan`` uses the default
    minimum of ``integer_attribute``, while ``rowspan`` also accepts 0.

    :returns: a list containing ``box``.

    """
    if not isinstance(box, boxes.TableCellBox):
        return [box]
    # HTML 4.01 gives special meaning to colspan=0
    # http://www.w3.org/TR/html401/struct/tables.html#adef-rowspan
    # but HTML 5 removed it
    # http://www.w3.org/TR/html5/tabular-data.html#attr-tdth-colspan
    # rowspan=0 is still there though.
    integer_attribute(element, box, 'colspan')
    integer_attribute(element, box, 'rowspan', minimum=0)
    return [box]
|
2012-08-03 17:20:22 +04:00
|
|
|
|
|
|
|
|
|
|
2014-04-04 20:46:00 +04:00
|
|
|
|
@handler('a')
def handle_a(element, box, _get_image_from_uri, base_url):
    """Handle the ``rel`` attribute of ``<a>`` elements.

    Sets ``box.is_attachment`` to whether ``rel`` contains the
    ``attachment`` link type.

    :returns: a list containing ``box``.

    """
    is_attachment = element_has_link_type(element, 'attachment')
    box.is_attachment = is_attachment
    return [box]
|
|
|
|
|
|
|
|
|
|
|
2012-08-03 17:20:22 +04:00
|
|
|
|
def find_base_url(html_document, fallback_base_url):
    """Return the base URL for the document.

    Only the first ``<base>`` element of the document is considered. If its
    stripped ``href`` attribute is not empty, it is joined to
    ``fallback_base_url``. In every other case ``fallback_base_url`` is
    returned unchanged.

    See http://www.w3.org/TR/html5/urls.html#document-base-url

    """
    for base_element in html_document.iter('base'):
        href = base_element.get('href', '').strip()
        if href:
            return urljoin(fallback_base_url, href)
        # Later <base> elements are ignored, even if the first is unusable.
        break
    return fallback_base_url
|
2013-07-14 15:08:02 +04:00
|
|
|
|
|
|
|
|
|
|
2017-07-07 12:14:07 +03:00
|
|
|
|
def get_html_metadata(wrapper_element, base_url):
    """Collect document metadata from ``<title>``, ``<meta>`` and ``<link>``.

    For ``title``, ``description`` and ``generator`` the first value found
    wins. ``dcterms.created`` and ``dcterms.modified`` are kept only if
    ``parse_w3c_date`` accepts them. Keywords are split on commas, stripped
    and de-duplicated. Every ``author`` is kept. Each
    ``<link rel="attachment">`` with a usable ``href`` adds a
    ``(url, title)`` tuple to the attachments.

    Relevant specs:

    http://www.whatwg.org/html#the-title-element
    http://www.whatwg.org/html#standard-metadata-names
    http://wiki.whatwg.org/wiki/MetaExtensions
    http://microformats.org/wiki/existing-rel-values#HTML5_link_type_extensions

    :returns:
        a dict with the keys ``title``, ``description``, ``generator``,
        ``keywords``, ``authors``, ``created``, ``modified`` and
        ``attachments``.

    """
    metadata = dict(
        title=None, description=None, generator=None,
        keywords=[], authors=[],
        created=None, modified=None,
        attachments=[])
    # <meta> names whose content is a date, mapped to their metadata key.
    date_fields = {'dcterms.created': 'created',
                   'dcterms.modified': 'modified'}

    for wrapper in wrapper_element.query_all('title', 'meta', 'link'):
        element = wrapper.etree_element
        tag = element.tag
        if tag == 'title' and metadata['title'] is None:
            metadata['title'] = get_child_text(element)
        elif tag == 'meta':
            name = ascii_lower(element.get('name', ''))
            content = element.get('content', '')
            if name == 'keywords':
                known_keywords = metadata['keywords']
                for raw_keyword in content.split(','):
                    keyword = strip_whitespace(raw_keyword)
                    if keyword not in known_keywords:
                        known_keywords.append(keyword)
            elif name == 'author':
                metadata['authors'].append(content)
            elif name in ('description', 'generator'):
                if metadata[name] is None:
                    metadata[name] = content
            elif name in date_fields:
                key = date_fields[name]
                # A rejected date leaves None, so a later <meta> can still
                # provide the value.
                if metadata[key] is None:
                    metadata[key] = parse_w3c_date(name, content)
        elif tag == 'link' and element_has_link_type(element, 'attachment'):
            url = get_url_attribute(element, 'href', base_url)
            attachment_title = element.get('title', None)
            if url is not None:
                metadata['attachments'].append((url, attachment_title))
            else:
                LOGGER.error('Missing href in <link rel="attachment">')
    return metadata
|
2013-07-14 15:08:02 +04:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def strip_whitespace(string):
    """Strip leading and trailing HTML "space characters" from a string.

    Only the characters of ``HTML_WHITESPACE`` are removed, not everything
    that Unicode considers whitespace.

    http://www.whatwg.org/html#strip-leading-and-trailing-whitespace
    http://www.whatwg.org/html#space-character

    """
    without_leading = string.lstrip(HTML_WHITESPACE)
    return without_leading.rstrip(HTML_WHITESPACE)
|
2013-07-14 15:08:02 +04:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Date and time formats of http://www.w3.org/TR/NOTE-datetime, optionally
# surrounded by HTML whitespace:
#
# YYYY (eg 1997)
# YYYY-MM (eg 1997-07)
# YYYY-MM-DD (eg 1997-07-16)
# YYYY-MM-DDThh:mmTZD (eg 1997-07-16T19:20+01:00)
# YYYY-MM-DDThh:mm:ssTZD (eg 1997-07-16T19:20:30+01:00)
# YYYY-MM-DDThh:mm:ss.sTZD (eg 1997-07-16T19:20:30.45+01:00)
#
# Months are restricted to 01-12 and days to 01-31; the day is not checked
# against the length of the month. A time always requires a time zone
# designator: either ``Z`` or a ``+hh:mm`` / ``-hh:mm`` offset.

W3C_DATE_RE = re.compile(r'''
    ^
    [ \t\n\f\r]*
    (?P<year>\d\d\d\d)
    (?:
        -(?P<month>0[1-9]|1[012])
        (?:
            -(?P<day>0[1-9]|[12]\d|3[01])
            (?:
                T(?P<hour>[01]\d|2[0-3])
                :(?P<minute>[0-5]\d)
                (?:
                    :(?P<second>[0-5]\d)
                    (?:\.\d+)?  # Second fraction, ignored
                )?
                (?:
                    Z |  # UTC
                    (?P<tz_hour>[+-](?:[01]\d|2[0-3]))
                    :(?P<tz_minute>[0-5]\d)
                )
            )?
        )?
    )?
    [ \t\n\f\r]*
    $
''', re.VERBOSE)
|
|
|
|
|
|
|
|
|
|
|
2017-07-01 01:28:14 +03:00
|
|
|
|
def parse_w3c_date(meta_name, string):
    """Validate a date in the http://www.w3.org/TR/NOTE-datetime format.

    :param meta_name: Name of the ``<meta>`` element, used in the warning.
    :param string: The date string to check against ``W3C_DATE_RE``.
    :returns:
        ``string`` unchanged when it is valid; otherwise a warning is
        logged and ``None`` is returned.

    """
    if not W3C_DATE_RE.match(string):
        LOGGER.warning(
            'Invalid date in <meta name="%s"> %r', meta_name, string)
        return None
    return string
|