mirror of
https://github.com/Kozea/WeasyPrint.git
synced 2024-10-05 00:21:15 +03:00
Merge pull request #528 from Kozea/text
Fix the line breaking algorithm
This commit is contained in:
commit
4e3cfb5b1d
@ -337,7 +337,7 @@ def strip_whitespace(string):
|
||||
http://www.whatwg.org/html#space-character
|
||||
|
||||
"""
|
||||
return string.strip(' \t\n\f\r')
|
||||
return string.strip(HTML_WHITESPACE)
|
||||
|
||||
|
||||
# YYYY (eg 1997)
|
||||
|
@ -16,7 +16,7 @@ import unicodedata
|
||||
|
||||
from ..css.computed_values import ex_ratio, strut_layout
|
||||
from ..formatting_structure import boxes
|
||||
from ..text import split_first_line
|
||||
from ..text import can_break_text, split_first_line
|
||||
from .absolute import AbsolutePlaceholder, absolute_layout
|
||||
from .float import avoid_collisions, float_layout
|
||||
from .min_max import handle_min_max_height, handle_min_max_width
|
||||
@ -101,7 +101,8 @@ def get_next_linebox(context, linebox, position_y, skip_stack,
|
||||
line_fixed = []
|
||||
waiting_floats = []
|
||||
|
||||
line, resume_at, preserved_line_break = split_inline_box(
|
||||
(line, resume_at, preserved_line_break, first_letter,
|
||||
last_letter) = split_inline_box(
|
||||
context, linebox, position_x, max_x, skip_stack,
|
||||
containing_block, device_size, line_absolutes,
|
||||
line_fixed, line_placeholders, waiting_floats)
|
||||
@ -226,7 +227,7 @@ def remove_last_whitespace(context, box):
|
||||
if len(new_text) == len(box.text):
|
||||
return
|
||||
box.text = new_text
|
||||
new_box, resume, _ = split_text_box(context, box, None, None, 0)
|
||||
new_box, resume, _ = split_text_box(context, box, None, 0)
|
||||
assert new_box is not None
|
||||
assert resume is None
|
||||
space_width = box.width - new_box.width
|
||||
@ -546,9 +547,10 @@ def split_inline_level(context, box, position_x, max_x, skip_stack,
|
||||
fixed_boxes, line_placeholders, waiting_floats):
|
||||
"""Fit as much content as possible from an inline-level box in a width.
|
||||
|
||||
Return ``(new_box, resume_at)``. ``resume_at`` is ``None`` if all of the
|
||||
content fits. Otherwise it can be passed as a ``skip_stack`` parameter
|
||||
to resume where we left off.
|
||||
Return ``(new_box, resume_at, preserved_line_break, first_letter,
|
||||
last_letter)``. ``resume_at`` is ``None`` if all of the content
|
||||
fits. Otherwise it can be passed as a ``skip_stack`` parameter to resume
|
||||
where we left off.
|
||||
|
||||
``new_box`` is non-empty (unless the box is empty) and as big as possible
|
||||
while being narrower than ``available_width``, if possible (may overflow
|
||||
@ -566,18 +568,24 @@ def split_inline_level(context, box, position_x, max_x, skip_stack,
|
||||
assert skip_stack is None
|
||||
|
||||
new_box, skip, preserved_line_break = split_text_box(
|
||||
context, box, max_x - position_x, max_x, skip)
|
||||
context, box, max_x - position_x, skip)
|
||||
|
||||
if skip is None:
|
||||
resume_at = None
|
||||
else:
|
||||
resume_at = (skip, None)
|
||||
if new_box and new_box.text:
|
||||
first_letter = new_box.text[0]
|
||||
last_letter = new_box.text[-1]
|
||||
else:
|
||||
first_letter = last_letter = None
|
||||
elif isinstance(box, boxes.InlineBox):
|
||||
if box.margin_left == 'auto':
|
||||
box.margin_left = 0
|
||||
if box.margin_right == 'auto':
|
||||
box.margin_right = 0
|
||||
new_box, resume_at, preserved_line_break = split_inline_box(
|
||||
(new_box, resume_at, preserved_line_break, first_letter,
|
||||
last_letter) = split_inline_box(
|
||||
context, box, position_x, max_x, skip_stack, containing_block,
|
||||
device_size, absolute_boxes, fixed_boxes, line_placeholders,
|
||||
waiting_floats)
|
||||
@ -588,8 +596,12 @@ def split_inline_level(context, box, position_x, max_x, skip_stack,
|
||||
new_box.position_x = position_x
|
||||
resume_at = None
|
||||
preserved_line_break = False
|
||||
# See https://www.w3.org/TR/css-text-3/#line-breaking
|
||||
# Atomic inlines behave like ideographic characters.
|
||||
first_letter = '\u2e80'
|
||||
last_letter = '\u2e80'
|
||||
# else: unexpected box type here
|
||||
return new_box, resume_at, preserved_line_break
|
||||
return new_box, resume_at, preserved_line_break, first_letter, last_letter
|
||||
|
||||
|
||||
def split_inline_box(context, box, position_x, max_x, skip_stack,
|
||||
@ -607,6 +619,7 @@ def split_inline_box(context, box, position_x, max_x, skip_stack,
|
||||
|
||||
is_start = skip_stack is None
|
||||
initial_position_x = position_x
|
||||
initial_skip_stack = skip_stack
|
||||
assert isinstance(box, (boxes.LineBox, boxes.InlineBox))
|
||||
left_spacing = (box.padding_left + box.margin_left +
|
||||
box.border_left_width)
|
||||
@ -617,7 +630,9 @@ def split_inline_box(context, box, position_x, max_x, skip_stack,
|
||||
content_box_left = position_x
|
||||
|
||||
children = []
|
||||
waiting_children = []
|
||||
preserved_line_break = False
|
||||
first_letter = last_letter = None
|
||||
|
||||
if box.style.position == 'relative':
|
||||
absolute_boxes = []
|
||||
@ -633,7 +648,7 @@ def split_inline_box(context, box, position_x, max_x, skip_stack,
|
||||
child.position_x = position_x
|
||||
placeholder = AbsolutePlaceholder(child)
|
||||
line_placeholders.append(placeholder)
|
||||
children.append(placeholder)
|
||||
waiting_children.append((index, placeholder))
|
||||
if child.style.position == 'absolute':
|
||||
absolute_boxes.append(placeholder)
|
||||
else:
|
||||
@ -647,7 +662,7 @@ def split_inline_box(context, box, position_x, max_x, skip_stack,
|
||||
# To retrieve the real available space for floats, we must remove
|
||||
# the trailing whitespaces from the line
|
||||
non_floating_children = [
|
||||
child_ for child_ in children if not child_.is_floated()]
|
||||
child_ for _, child_ in children if not child_.is_floated()]
|
||||
if non_floating_children:
|
||||
float_width -= trailing_whitespace_size(
|
||||
context, non_floating_children[-1])
|
||||
@ -660,9 +675,9 @@ def split_inline_box(context, box, position_x, max_x, skip_stack,
|
||||
child = float_layout(
|
||||
context, child, containing_block, device_size,
|
||||
absolute_boxes, fixed_boxes)
|
||||
children.append(child)
|
||||
waiting_children.append((index, child))
|
||||
# TODO: use the main text direction of the line
|
||||
for old_child in children[:index]:
|
||||
for _, old_child in children[:index]:
|
||||
if not old_child.is_in_normal_flow():
|
||||
continue
|
||||
if child.style.float == 'left': # and direction is ltr
|
||||
@ -675,7 +690,7 @@ def split_inline_box(context, box, position_x, max_x, skip_stack,
|
||||
max_x -= max(child.margin_width(), 0)
|
||||
continue
|
||||
|
||||
new_child, resume_at, preserved = split_inline_level(
|
||||
new_child, resume_at, preserved, first, last = split_inline_level(
|
||||
context, child, position_x, max_x, skip_stack, containing_block,
|
||||
device_size, absolute_boxes, fixed_boxes, line_placeholders,
|
||||
waiting_floats)
|
||||
@ -683,6 +698,20 @@ def split_inline_box(context, box, position_x, max_x, skip_stack,
|
||||
if preserved:
|
||||
preserved_line_break = True
|
||||
|
||||
if None in (last_letter, first):
|
||||
can_break = True
|
||||
else:
|
||||
can_break = can_break_text(
|
||||
last_letter + first, child.style['lang'])
|
||||
|
||||
if can_break:
|
||||
children.extend(waiting_children)
|
||||
waiting_children = []
|
||||
|
||||
if first_letter is None:
|
||||
first_letter = first
|
||||
last_letter = last
|
||||
|
||||
# TODO: this is non-optimal when last_child is True and
|
||||
# width <= remaining_width < width + right_spacing
|
||||
# with
|
||||
@ -697,23 +726,54 @@ def split_inline_box(context, box, position_x, max_x, skip_stack,
|
||||
margin_width = new_child.margin_width()
|
||||
new_position_x = position_x + margin_width
|
||||
|
||||
if (new_position_x > max_x and children):
|
||||
# too wide, and the inline is non-empty:
|
||||
# put child entirely on the next line.
|
||||
resume_at = (index, None)
|
||||
break
|
||||
else:
|
||||
position_x = new_position_x
|
||||
children.append(new_child)
|
||||
if new_position_x > max_x:
|
||||
if children:
|
||||
# too wide, and the inline is non-empty:
|
||||
# put child entirely on the next line.
|
||||
resume_at = (children[-1][0] + 1, None)
|
||||
break
|
||||
elif waiting_children:
|
||||
# too wide, the inline is empty, we tried to add children
|
||||
# but can't split the line between them: split the last
|
||||
# child that can be split inside.
|
||||
# TODO: we should take care of children added into
|
||||
# absolute_boxes, fixed_boxes and other lists.
|
||||
for index, child in reversed(waiting_children):
|
||||
# TODO: what about relative children?
|
||||
if (child.is_in_normal_flow() and
|
||||
can_break_inside(child)):
|
||||
# TODO: replace -1, we use it to cut the last word
|
||||
# of the line.
|
||||
answer = split_inline_box(
|
||||
context, box, initial_position_x,
|
||||
child.position_x + child.margin_width() - 1,
|
||||
initial_skip_stack, containing_block,
|
||||
device_size, absolute_boxes, fixed_boxes,
|
||||
line_placeholders, waiting_floats)
|
||||
children = (
|
||||
waiting_children[:index] +
|
||||
[(index, answer[0])])
|
||||
resume_at = answer[1]
|
||||
break
|
||||
else:
|
||||
children = [waiting_children[0]]
|
||||
resume_at = (waiting_children[0][0] + 1, None)
|
||||
break
|
||||
|
||||
position_x = new_position_x
|
||||
waiting_children.append((index, new_child))
|
||||
|
||||
if resume_at is not None:
|
||||
children.extend(waiting_children)
|
||||
resume_at = (index, resume_at)
|
||||
break
|
||||
else:
|
||||
children.extend(waiting_children)
|
||||
resume_at = None
|
||||
|
||||
new_box = box.copy_with_children(
|
||||
children, is_start=is_start, is_end=resume_at is None)
|
||||
[child for index, child in children],
|
||||
is_start=is_start, is_end=resume_at is None)
|
||||
if isinstance(box, boxes.LineBox):
|
||||
# Line boxes already have a position_x which may not be the same
|
||||
# as content_box_left when text-indent is non-zero.
|
||||
@ -736,18 +796,19 @@ def split_inline_box(context, box, position_x, max_x, skip_stack,
|
||||
if new_box.style.position == 'relative':
|
||||
for absolute_box in absolute_boxes:
|
||||
absolute_layout(context, absolute_box, new_box, fixed_boxes)
|
||||
return new_box, resume_at, preserved_line_break
|
||||
return new_box, resume_at, preserved_line_break, first_letter, last_letter
|
||||
|
||||
|
||||
def split_text_box(context, box, available_width, line_width, skip):
|
||||
"""Keep as much text as possible from a TextBox in a limitied width.
|
||||
def split_text_box(context, box, available_width, skip):
|
||||
"""Keep as much text as possible from a TextBox in a limited width.
|
||||
|
||||
Try not to overflow but always have some text in ``new_box``
|
||||
|
||||
Return ``(new_box, skip)``. ``skip`` is the number of UTF-8 bytes
|
||||
to skip from the start of the TextBox for the next line, or ``None``
|
||||
if all of the text fits.
|
||||
Return ``(new_box, skip, preserved_line_break)``. ``skip`` is the number of
|
||||
UTF-8 bytes to skip from the start of the TextBox for the next line, or
|
||||
``None`` if all of the text fits.
|
||||
|
||||
Also break an preserved whitespace.
|
||||
Also break on preserved line breaks.
|
||||
|
||||
"""
|
||||
assert isinstance(box, boxes.TextBox)
|
||||
@ -756,8 +817,7 @@ def split_text_box(context, box, available_width, line_width, skip):
|
||||
if font_size == 0 or not text:
|
||||
return None, None, False
|
||||
layout, length, resume_at, width, height, baseline = split_first_line(
|
||||
text, box.style, context, available_width, line_width,
|
||||
box.justification_spacing)
|
||||
text, box.style, context, available_width, box.justification_spacing)
|
||||
assert resume_at != 0
|
||||
|
||||
# Convert ``length`` and ``resume_at`` from UTF-8 indexes in text
|
||||
@ -801,8 +861,9 @@ def split_text_box(context, box, available_width, line_width, skip):
|
||||
preserved_line_break = (length != resume_at) and between.strip(' ')
|
||||
if preserved_line_break:
|
||||
# See http://unicode.org/reports/tr14/
|
||||
# TODO: are there others? Find Pango docs on this
|
||||
assert between in ('\n', '\u2029'), (
|
||||
# \r is already handled by process_whitespace
|
||||
line_breaks = ('\n', '\t', '\f', '\u0085', '\u2028', '\u2029')
|
||||
assert between in line_breaks, (
|
||||
'Got %r between two lines. '
|
||||
'Expected nothing or a preserved line break' % (between,))
|
||||
resume_at += skip
|
||||
@ -1013,7 +1074,7 @@ def add_word_spacing(context, box, justification_spacing, x_advance):
|
||||
nb_spaces = count_spaces(box)
|
||||
if nb_spaces > 0:
|
||||
layout, _, resume_at, width, _, _ = split_first_line(
|
||||
box.text, box.style, context, float('inf'), None,
|
||||
box.text, box.style, context, float('inf'),
|
||||
box.justification_spacing)
|
||||
assert resume_at is None
|
||||
# XXX new_box.width - box.width is always 0???
|
||||
@ -1049,3 +1110,13 @@ def is_phantom_linebox(linebox):
|
||||
elif child.is_in_normal_flow():
|
||||
return False
|
||||
return True
|
||||
|
||||
|
||||
def can_break_inside(box):
    """Tell whether a line break opportunity exists somewhere inside ``box``.

    The type checks are done in a fixed order: atomic inline-level boxes are
    never breakable, text boxes delegate to ``can_break_text`` with their
    text and ``lang`` style, and parent boxes are breakable as soon as one of
    their children is. Any other kind of box is reported as unbreakable.

    """
    if isinstance(box, boxes.AtomicInlineLevelBox):
        return False
    if isinstance(box, boxes.TextBox):
        return can_break_text(box.text, box.style['lang'])
    if isinstance(box, boxes.ParentBox):
        # Depth-first search, stopping at the first breakable descendant.
        for child in box.children:
            if can_break_inside(child):
                return True
    return False
|
||||
|
@ -33,7 +33,6 @@ def list_marker_layout(context, box):
|
||||
(marker.pango_layout, _, _, marker.width, marker.height,
|
||||
marker.baseline) = split_first_line(
|
||||
marker.text, marker.style, context, max_width=None,
|
||||
line_width=None,
|
||||
justification_spacing=marker.justification_spacing)
|
||||
baseline = find_in_flow_baseline(box)
|
||||
else:
|
||||
|
@ -246,14 +246,18 @@ def inline_line_widths(context, box, outer, is_line_start, minimum,
|
||||
if minimum and child_text == ' ':
|
||||
lines = [0, 0]
|
||||
else:
|
||||
lines = text.line_widths(
|
||||
child_text, child.style, context,
|
||||
width=0 if minimum else None,
|
||||
justification_spacing=child.justification_spacing)
|
||||
if first_line:
|
||||
lines = [next(lines)]
|
||||
else:
|
||||
lines = list(lines)
|
||||
max_width = 0 if minimum else None
|
||||
lines = []
|
||||
resume_at = new_resume_at = 0
|
||||
while new_resume_at is not None:
|
||||
resume_at += new_resume_at
|
||||
_, _, new_resume_at, width, _, _ = (
|
||||
text.split_first_line(
|
||||
child_text[resume_at:], child.style, context,
|
||||
max_width, child.justification_spacing))
|
||||
lines.append(width)
|
||||
if first_line:
|
||||
break
|
||||
else:
|
||||
# http://www.w3.org/TR/css3-text/#line-break-details
|
||||
# "The line breaking behavior of a replaced element
|
||||
@ -631,16 +635,15 @@ def trailing_whitespace_size(context, box):
|
||||
if box.style.font_size == 0 or len(stripped_text) == len(box.text):
|
||||
return 0
|
||||
if stripped_text:
|
||||
old_box, _, _ = split_text_box(context, box, None, None, 0)
|
||||
old_box, _, _ = split_text_box(context, box, None, 0)
|
||||
assert old_box
|
||||
stripped_box = box.copy_with_text(stripped_text)
|
||||
stripped_box, resume, _ = split_text_box(
|
||||
context, stripped_box, None, None, 0)
|
||||
context, stripped_box, None, 0)
|
||||
assert stripped_box is not None
|
||||
assert resume is None
|
||||
return old_box.width - stripped_box.width
|
||||
else:
|
||||
_, _, _, width, _, _ = split_first_line(
|
||||
box.text, box.style, context, None, None,
|
||||
box.justification_spacing)
|
||||
box.text, box.style, context, None, box.justification_spacing)
|
||||
return width
|
||||
|
@ -29,7 +29,7 @@ def make_text(text, width=None, **style):
|
||||
new_style.update(style)
|
||||
return split_first_line(
|
||||
text, StyleDict(new_style), context=None, max_width=width,
|
||||
line_width=None, justification_spacing=0)
|
||||
justification_spacing=0)
|
||||
|
||||
|
||||
@assert_no_logs
|
||||
|
@ -126,6 +126,11 @@ ffi.cdef('''
|
||||
int height;
|
||||
} PangoRectangle;
|
||||
|
||||
typedef struct {
|
||||
guint is_line_break: 1;
|
||||
/* ... */
|
||||
} PangoLogAttr;
|
||||
|
||||
int pango_version (void);
|
||||
|
||||
double pango_units_to_double (int i);
|
||||
@ -210,6 +215,10 @@ ffi.cdef('''
|
||||
|
||||
PangoContext * pango_layout_get_context (PangoLayout *layout);
|
||||
|
||||
void pango_get_log_attrs (
|
||||
const char *text, int length, int level, PangoLanguage *language,
|
||||
PangoLogAttr *log_attrs, int attrs_len);
|
||||
|
||||
|
||||
// PangoCairo
|
||||
|
||||
@ -907,8 +916,7 @@ def create_layout(text, style, context, max_width, justification_spacing):
|
||||
return layout
|
||||
|
||||
|
||||
def split_first_line(text, style, context, max_width, line_width,
|
||||
justification_spacing):
|
||||
def split_first_line(text, style, context, max_width, justification_spacing):
|
||||
"""Fit as much as possible in the available width for one line of text.
|
||||
|
||||
Return ``(layout, length, resume_at, width, height, baseline)``.
|
||||
@ -933,7 +941,7 @@ def split_first_line(text, style, context, max_width, line_width,
|
||||
|
||||
# Step #1: Get a draft layout with the first line
|
||||
layout = None
|
||||
if max_width is not None and max_width != float('inf'):
|
||||
if max_width is not None and max_width != float('inf') and style.font_size:
|
||||
expected_length = int(max_width / style.font_size * 2.5)
|
||||
if expected_length < len(text):
|
||||
# Try to use a small amount of text instead of the whole text
|
||||
@ -955,7 +963,7 @@ def split_first_line(text, style, context, max_width, line_width,
|
||||
second_line = next(lines, None)
|
||||
resume_at = None if second_line is None else second_line.start_index
|
||||
|
||||
# Step #2: Don't hyphenize when it's not needed
|
||||
# Step #2: Don't split lines when it's not needed
|
||||
if max_width is None:
|
||||
# The first line can take all the place needed
|
||||
return first_line_metrics(
|
||||
@ -1127,7 +1135,7 @@ def split_first_line(text, style, context, max_width, line_width,
|
||||
# The way new lines are processed in this function (one by one with no
|
||||
# memory of the last) prevents shaping characters (arabic, for
|
||||
# instance) from keeping their shape when wrapped on the next line with
|
||||
# pango layout. Maybe insert Unicode shaping characters in text ?
|
||||
# pango layout. Maybe insert Unicode shaping characters in text?
|
||||
layout.set_text(text)
|
||||
pango.pango_layout_set_width(
|
||||
layout.layout, units_from_double(max_width))
|
||||
@ -1158,14 +1166,6 @@ def split_first_line(text, style, context, max_width, line_width,
|
||||
style.hyphenate_character)
|
||||
|
||||
|
||||
def line_widths(text, style, context, width, justification_spacing):
    """Lazily yield the width of every line of ``text``.

    The layout is built by ``create_layout`` with ``width`` as its maximum
    width; each of its lines is then measured with ``get_size``, whose height
    component is discarded.

    """
    # Keep a named reference to the layout for the whole iteration.
    layout = create_layout(text, style, context, width, justification_spacing)
    for layout_line in layout.iter_lines():
        measured_width, _ = get_size(layout_line, style)
        yield measured_width
|
||||
|
||||
|
||||
def show_first_line(context, pango_layout, hinting):
|
||||
"""Draw the given ``line`` to the Cairo ``context``."""
|
||||
context = ffi.cast('cairo_t *', context._pointer)
|
||||
@ -1177,3 +1177,21 @@ def show_first_line(context, pango_layout, hinting):
|
||||
pango.pango_layout_set_width(pango_layout.layout, -1)
|
||||
pangocairo.pango_cairo_show_layout_line(
|
||||
context, next(pango_layout.iter_lines()))
|
||||
|
||||
|
||||
def can_break_text(text, lang):
    """Tell whether Pango allows a line break strictly inside ``text``.

    :param text: the string to analyse. Empty, ``None`` or one-character
        strings have no inner position and always give ``False``.
    :param lang: language tag used to pick the Pango language, or a falsy
        value to use Pango's default language.
    :returns: ``True`` if at least one position between two characters of
        ``text`` is flagged as a line break opportunity, ``False`` otherwise.
        The positions before the first character and after the last one are
        not taken into account.

    """
    if not text or len(text) < 2:
        return False

    # Fall back to the default language unless a usable tag is given.
    language = pango.pango_language_get_default()
    if lang:
        # ``lang_p`` must stay referenced while Pango reads the C string.
        lang_p, lang_bytes = unicode_to_char_p(lang)
        if lang_bytes:
            language = pango.pango_language_from_string(lang_p)

    text_p, bytestring = unicode_to_char_p(text)
    # Pango fills one PangoLogAttr per *character* plus a trailing one, not
    # one per UTF-8 byte. Sizing the array on the byte count made the slice
    # below include the end-of-text attribute for non-ASCII text, which
    # reported a break opportunity that is not inside the text.
    length = len(text) + 1
    log_attrs = ffi.new('PangoLogAttr[]', length)
    pango.pango_get_log_attrs(
        text_p, len(bytestring), -1, language, log_attrs, length)
    # Skip the first and last attributes: they describe the text boundaries.
    return any(attr.is_line_break for attr in log_attrs[1:length - 1])
|
||||
|
Loading…
Reference in New Issue
Block a user