1
1
mirror of https://github.com/Kozea/WeasyPrint.git synced 2024-10-05 00:21:15 +03:00

Merge pull request #528 from Kozea/text

Fix the line breaking algorithm
This commit is contained in:
Guillaume Ayoub 2017-11-16 17:37:09 +01:00 committed by GitHub
commit 4e3cfb5b1d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 154 additions and 63 deletions

View File

@ -337,7 +337,7 @@ def strip_whitespace(string):
http://www.whatwg.org/html#space-character
"""
return string.strip(' \t\n\f\r')
return string.strip(HTML_WHITESPACE)
# YYYY (eg 1997)

View File

@ -16,7 +16,7 @@ import unicodedata
from ..css.computed_values import ex_ratio, strut_layout
from ..formatting_structure import boxes
from ..text import split_first_line
from ..text import can_break_text, split_first_line
from .absolute import AbsolutePlaceholder, absolute_layout
from .float import avoid_collisions, float_layout
from .min_max import handle_min_max_height, handle_min_max_width
@ -101,7 +101,8 @@ def get_next_linebox(context, linebox, position_y, skip_stack,
line_fixed = []
waiting_floats = []
line, resume_at, preserved_line_break = split_inline_box(
(line, resume_at, preserved_line_break, first_letter,
last_letter) = split_inline_box(
context, linebox, position_x, max_x, skip_stack,
containing_block, device_size, line_absolutes,
line_fixed, line_placeholders, waiting_floats)
@ -226,7 +227,7 @@ def remove_last_whitespace(context, box):
if len(new_text) == len(box.text):
return
box.text = new_text
new_box, resume, _ = split_text_box(context, box, None, None, 0)
new_box, resume, _ = split_text_box(context, box, None, 0)
assert new_box is not None
assert resume is None
space_width = box.width - new_box.width
@ -546,9 +547,10 @@ def split_inline_level(context, box, position_x, max_x, skip_stack,
fixed_boxes, line_placeholders, waiting_floats):
"""Fit as much content as possible from an inline-level box in a width.
Return ``(new_box, resume_at)``. ``resume_at`` is ``None`` if all of the
content fits. Otherwise it can be passed as a ``skip_stack`` parameter
to resume where we left off.
Return ``(new_box, resume_at, preserved_line_break, first_letter,
last_letter)``. ``resume_at`` is ``None`` if all of the content
fits. Otherwise it can be passed as a ``skip_stack`` parameter to resume
where we left off.
``new_box`` is non-empty (unless the box is empty) and as big as possible
while being narrower than ``available_width``, if possible (may overflow
@ -566,18 +568,24 @@ def split_inline_level(context, box, position_x, max_x, skip_stack,
assert skip_stack is None
new_box, skip, preserved_line_break = split_text_box(
context, box, max_x - position_x, max_x, skip)
context, box, max_x - position_x, skip)
if skip is None:
resume_at = None
else:
resume_at = (skip, None)
if new_box and new_box.text:
first_letter = new_box.text[0]
last_letter = new_box.text[-1]
else:
first_letter = last_letter = None
elif isinstance(box, boxes.InlineBox):
if box.margin_left == 'auto':
box.margin_left = 0
if box.margin_right == 'auto':
box.margin_right = 0
new_box, resume_at, preserved_line_break = split_inline_box(
(new_box, resume_at, preserved_line_break, first_letter,
last_letter) = split_inline_box(
context, box, position_x, max_x, skip_stack, containing_block,
device_size, absolute_boxes, fixed_boxes, line_placeholders,
waiting_floats)
@ -588,8 +596,12 @@ def split_inline_level(context, box, position_x, max_x, skip_stack,
new_box.position_x = position_x
resume_at = None
preserved_line_break = False
# See https://www.w3.org/TR/css-text-3/#line-breaking
# Atomic inlines behave like ideographic characters.
first_letter = '\u2e80'
last_letter = '\u2e80'
# else: unexpected box type here
return new_box, resume_at, preserved_line_break
return new_box, resume_at, preserved_line_break, first_letter, last_letter
def split_inline_box(context, box, position_x, max_x, skip_stack,
@ -607,6 +619,7 @@ def split_inline_box(context, box, position_x, max_x, skip_stack,
is_start = skip_stack is None
initial_position_x = position_x
initial_skip_stack = skip_stack
assert isinstance(box, (boxes.LineBox, boxes.InlineBox))
left_spacing = (box.padding_left + box.margin_left +
box.border_left_width)
@ -617,7 +630,9 @@ def split_inline_box(context, box, position_x, max_x, skip_stack,
content_box_left = position_x
children = []
waiting_children = []
preserved_line_break = False
first_letter = last_letter = None
if box.style.position == 'relative':
absolute_boxes = []
@ -633,7 +648,7 @@ def split_inline_box(context, box, position_x, max_x, skip_stack,
child.position_x = position_x
placeholder = AbsolutePlaceholder(child)
line_placeholders.append(placeholder)
children.append(placeholder)
waiting_children.append((index, placeholder))
if child.style.position == 'absolute':
absolute_boxes.append(placeholder)
else:
@ -647,7 +662,7 @@ def split_inline_box(context, box, position_x, max_x, skip_stack,
# To retrieve the real available space for floats, we must remove
# the trailing whitespaces from the line
non_floating_children = [
child_ for child_ in children if not child_.is_floated()]
child_ for _, child_ in children if not child_.is_floated()]
if non_floating_children:
float_width -= trailing_whitespace_size(
context, non_floating_children[-1])
@ -660,9 +675,9 @@ def split_inline_box(context, box, position_x, max_x, skip_stack,
child = float_layout(
context, child, containing_block, device_size,
absolute_boxes, fixed_boxes)
children.append(child)
waiting_children.append((index, child))
# TODO: use the main text direction of the line
for old_child in children[:index]:
for _, old_child in children[:index]:
if not old_child.is_in_normal_flow():
continue
if child.style.float == 'left': # and direction is ltr
@ -675,7 +690,7 @@ def split_inline_box(context, box, position_x, max_x, skip_stack,
max_x -= max(child.margin_width(), 0)
continue
new_child, resume_at, preserved = split_inline_level(
new_child, resume_at, preserved, first, last = split_inline_level(
context, child, position_x, max_x, skip_stack, containing_block,
device_size, absolute_boxes, fixed_boxes, line_placeholders,
waiting_floats)
@ -683,6 +698,20 @@ def split_inline_box(context, box, position_x, max_x, skip_stack,
if preserved:
preserved_line_break = True
if None in (last_letter, first):
can_break = True
else:
can_break = can_break_text(
last_letter + first, child.style['lang'])
if can_break:
children.extend(waiting_children)
waiting_children = []
if first_letter is None:
first_letter = first
last_letter = last
# TODO: this is non-optimal when last_child is True and
# width <= remaining_width < width + right_spacing
# with
@ -697,23 +726,54 @@ def split_inline_box(context, box, position_x, max_x, skip_stack,
margin_width = new_child.margin_width()
new_position_x = position_x + margin_width
if (new_position_x > max_x and children):
# too wide, and the inline is non-empty:
# put child entirely on the next line.
resume_at = (index, None)
break
else:
position_x = new_position_x
children.append(new_child)
if new_position_x > max_x:
if children:
# too wide, and the inline is non-empty:
# put child entirely on the next line.
resume_at = (children[-1][0] + 1, None)
break
elif waiting_children:
# too wide, the inline is empty, we tried to add children
# but can't split the line between them: split the last
# child that can be split inside.
# TODO: we should take care of children added into
# absolute_boxes, fixed_boxes and other lists.
for index, child in reversed(waiting_children):
# TODO: what about relative children?
if (child.is_in_normal_flow() and
can_break_inside(child)):
# TODO: replace -1, we use it to cut the last word
# of the line.
answer = split_inline_box(
context, box, initial_position_x,
child.position_x + child.margin_width() - 1,
initial_skip_stack, containing_block,
device_size, absolute_boxes, fixed_boxes,
line_placeholders, waiting_floats)
children = (
waiting_children[:index] +
[(index, answer[0])])
resume_at = answer[1]
break
else:
children = [waiting_children[0]]
resume_at = (waiting_children[0][0] + 1, None)
break
position_x = new_position_x
waiting_children.append((index, new_child))
if resume_at is not None:
children.extend(waiting_children)
resume_at = (index, resume_at)
break
else:
children.extend(waiting_children)
resume_at = None
new_box = box.copy_with_children(
children, is_start=is_start, is_end=resume_at is None)
[child for index, child in children],
is_start=is_start, is_end=resume_at is None)
if isinstance(box, boxes.LineBox):
# Line boxes already have a position_x which may not be the same
# as content_box_left when text-indent is non-zero.
@ -736,18 +796,19 @@ def split_inline_box(context, box, position_x, max_x, skip_stack,
if new_box.style.position == 'relative':
for absolute_box in absolute_boxes:
absolute_layout(context, absolute_box, new_box, fixed_boxes)
return new_box, resume_at, preserved_line_break
return new_box, resume_at, preserved_line_break, first_letter, last_letter
def split_text_box(context, box, available_width, line_width, skip):
"""Keep as much text as possible from a TextBox in a limitied width.
def split_text_box(context, box, available_width, skip):
"""Keep as much text as possible from a TextBox in a limited width.
Try not to overflow but always have some text in ``new_box``
Return ``(new_box, skip)``. ``skip`` is the number of UTF-8 bytes
to skip form the start of the TextBox for the next line, or ``None``
if all of the text fits.
Return ``(new_box, skip, preserved_line_break)``. ``skip`` is the number of
UTF-8 bytes to skip from the start of the TextBox for the next line, or
``None`` if all of the text fits.
Also break an preserved whitespace.
Also break on preserved line breaks.
"""
assert isinstance(box, boxes.TextBox)
@ -756,8 +817,7 @@ def split_text_box(context, box, available_width, line_width, skip):
if font_size == 0 or not text:
return None, None, False
layout, length, resume_at, width, height, baseline = split_first_line(
text, box.style, context, available_width, line_width,
box.justification_spacing)
text, box.style, context, available_width, box.justification_spacing)
assert resume_at != 0
# Convert ``length`` and ``resume_at`` from UTF-8 indexes in text
@ -801,8 +861,9 @@ def split_text_box(context, box, available_width, line_width, skip):
preserved_line_break = (length != resume_at) and between.strip(' ')
if preserved_line_break:
# See http://unicode.org/reports/tr14/
# TODO: are there others? Find Pango docs on this
assert between in ('\n', '\u2029'), (
# \r is already handled by process_whitespace
line_breaks = ('\n', '\t', '\f', '\u0085', '\u2028', '\u2029')
assert between in line_breaks, (
'Got %r between two lines. '
'Expected nothing or a preserved line break' % (between,))
resume_at += skip
@ -1013,7 +1074,7 @@ def add_word_spacing(context, box, justification_spacing, x_advance):
nb_spaces = count_spaces(box)
if nb_spaces > 0:
layout, _, resume_at, width, _, _ = split_first_line(
box.text, box.style, context, float('inf'), None,
box.text, box.style, context, float('inf'),
box.justification_spacing)
assert resume_at is None
# XXX new_box.width - box.width is always 0???
@ -1049,3 +1110,13 @@ def is_phantom_linebox(linebox):
elif child.is_in_normal_flow():
return False
return True
def can_break_inside(box):
    """Tell whether a line break opportunity exists somewhere inside ``box``.

    Atomic inline-level boxes are never breakable. A text box is breakable
    when ``can_break_text`` finds a break in its text for the box language.
    A parent box is breakable as soon as one of its children is. Any other
    kind of box is reported as not breakable.

    """
    # The atomic check comes first so that it wins over the later checks.
    if isinstance(box, boxes.AtomicInlineLevelBox):
        return False
    if isinstance(box, boxes.TextBox):
        return can_break_text(box.text, box.style['lang'])
    if isinstance(box, boxes.ParentBox):
        # Stop at the first breakable descendant.
        for child in box.children:
            if can_break_inside(child):
                return True
    return False

View File

@ -33,7 +33,6 @@ def list_marker_layout(context, box):
(marker.pango_layout, _, _, marker.width, marker.height,
marker.baseline) = split_first_line(
marker.text, marker.style, context, max_width=None,
line_width=None,
justification_spacing=marker.justification_spacing)
baseline = find_in_flow_baseline(box)
else:

View File

@ -246,14 +246,18 @@ def inline_line_widths(context, box, outer, is_line_start, minimum,
if minimum and child_text == ' ':
lines = [0, 0]
else:
lines = text.line_widths(
child_text, child.style, context,
width=0 if minimum else None,
justification_spacing=child.justification_spacing)
if first_line:
lines = [next(lines)]
else:
lines = list(lines)
max_width = 0 if minimum else None
lines = []
resume_at = new_resume_at = 0
while new_resume_at is not None:
resume_at += new_resume_at
_, _, new_resume_at, width, _, _ = (
text.split_first_line(
child_text[resume_at:], child.style, context,
max_width, child.justification_spacing))
lines.append(width)
if first_line:
break
else:
# http://www.w3.org/TR/css3-text/#line-break-details
# "The line breaking behavior of a replaced element
@ -631,16 +635,15 @@ def trailing_whitespace_size(context, box):
if box.style.font_size == 0 or len(stripped_text) == len(box.text):
return 0
if stripped_text:
old_box, _, _ = split_text_box(context, box, None, None, 0)
old_box, _, _ = split_text_box(context, box, None, 0)
assert old_box
stripped_box = box.copy_with_text(stripped_text)
stripped_box, resume, _ = split_text_box(
context, stripped_box, None, None, 0)
context, stripped_box, None, 0)
assert stripped_box is not None
assert resume is None
return old_box.width - stripped_box.width
else:
_, _, _, width, _, _ = split_first_line(
box.text, box.style, context, None, None,
box.justification_spacing)
box.text, box.style, context, None, box.justification_spacing)
return width

View File

@ -29,7 +29,7 @@ def make_text(text, width=None, **style):
new_style.update(style)
return split_first_line(
text, StyleDict(new_style), context=None, max_width=width,
line_width=None, justification_spacing=0)
justification_spacing=0)
@assert_no_logs

View File

@ -126,6 +126,11 @@ ffi.cdef('''
int height;
} PangoRectangle;
typedef struct {
guint is_line_break: 1;
/* ... */
} PangoLogAttr;
int pango_version (void);
double pango_units_to_double (int i);
@ -210,6 +215,10 @@ ffi.cdef('''
PangoContext * pango_layout_get_context (PangoLayout *layout);
void pango_get_log_attrs (
const char *text, int length, int level, PangoLanguage *language,
PangoLogAttr *log_attrs, int attrs_len);
// PangoCairo
@ -907,8 +916,7 @@ def create_layout(text, style, context, max_width, justification_spacing):
return layout
def split_first_line(text, style, context, max_width, line_width,
justification_spacing):
def split_first_line(text, style, context, max_width, justification_spacing):
"""Fit as much as possible in the available width for one line of text.
Return ``(layout, length, resume_at, width, height, baseline)``.
@ -933,7 +941,7 @@ def split_first_line(text, style, context, max_width, line_width,
# Step #1: Get a draft layout with the first line
layout = None
if max_width is not None and max_width != float('inf'):
if max_width is not None and max_width != float('inf') and style.font_size:
expected_length = int(max_width / style.font_size * 2.5)
if expected_length < len(text):
# Try to use a small amount of text instead of the whole text
@ -955,7 +963,7 @@ def split_first_line(text, style, context, max_width, line_width,
second_line = next(lines, None)
resume_at = None if second_line is None else second_line.start_index
# Step #2: Don't hyphenize when it's not needed
# Step #2: Don't split lines when it's not needed
if max_width is None:
# The first line can take all the place needed
return first_line_metrics(
@ -1127,7 +1135,7 @@ def split_first_line(text, style, context, max_width, line_width,
# The way new lines are processed in this function (one by one with no
# memory of the last) prevents shaping characters (arabic, for
# instance) from keeping their shape when wrapped on the next line with
# pango layout. Maybe insert Unicode shaping characters in text ?
# pango layout. Maybe insert Unicode shaping characters in text?
layout.set_text(text)
pango.pango_layout_set_width(
layout.layout, units_from_double(max_width))
@ -1158,14 +1166,6 @@ def split_first_line(text, style, context, max_width, line_width,
style.hyphenate_character)
def line_widths(text, style, context, width, justification_spacing):
    """Yield the width of each line of ``text`` laid out in ``width``.

    The layout is built by ``create_layout`` on the first iteration, then
    each of its lines is measured with ``get_size``; only the width part of
    the measured size is yielded.

    """
    text_layout = create_layout(
        text, style, context, width, justification_spacing)
    for layout_line in text_layout.iter_lines():
        line_width, _ = get_size(layout_line, style)
        yield line_width
def show_first_line(context, pango_layout, hinting):
"""Draw the given ``line`` to the Cairo ``context``."""
context = ffi.cast('cairo_t *', context._pointer)
@ -1177,3 +1177,21 @@ def show_first_line(context, pango_layout, hinting):
pango.pango_layout_set_width(pango_layout.layout, -1)
pangocairo.pango_cairo_show_layout_line(
context, next(pango_layout.iter_lines()))
def can_break_text(text, lang):
    """Tell whether Pango reports a line break opportunity inside ``text``.

    ``lang`` selects the Pango language used for the analysis. When it is
    empty or ``None``, Pango's default language is used instead.

    Return ``False`` without calling Pango when ``text`` is empty, ``None``
    or shorter than two characters. Otherwise return ``True`` if at least
    one interior position of the text is flagged as a line break.

    """
    if not text or len(text) < 2:
        return False

    # Pick the language in a single branch on the original argument, so
    # that ``language`` is always bound before it is handed to Pango.
    if lang:
        lang_p, _ = unicode_to_char_p(lang)
        language = pango.pango_language_from_string(lang_p)
    else:
        language = pango.pango_language_get_default()

    text_p, bytestring = unicode_to_char_p(text)
    # The attribute array is sized from the encoded length plus one slot.
    length = len(bytestring) + 1
    log_attrs = ffi.new('PangoLogAttr[]', length)
    pango.pango_get_log_attrs(
        text_p, len(bytestring), -1, language, log_attrs, length)
    # Skip the first and last entries: only positions strictly inside the
    # text count as places where the text itself can be broken.
    return any(attr.is_line_break for attr in log_attrs[1:length - 1])