Fine tune the artificial delays in the render loop

There are now two numbers, repaint_delay and input_delay
that control how often the screen is repainted and how frequently
input received from the child process is processed.

This halves the CPU usage in intensive cases such as scrolling
a file in less. The CPU usage of kitty + X when scrolling is now
significantly lower than all the other terminals on my system.

MROAWR!

...
This commit is contained in:
Kovid Goyal 2017-09-16 08:10:19 +05:30
parent 43ebddc28f
commit 728f33700a
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
8 changed files with 56 additions and 47 deletions

View File

@ -124,7 +124,7 @@ or a similar package manager)
kitty is designed for power keyboard users. To that end all its controls
work with the keyboard (although it fully supports mouse interactions as
well). Its configuration is a simple, human editable, single file for
easy reproducability (I like to store config files in source control).
easy reproducibility (I like to store config files in source control).
The code in kitty is designed to be simple, modular and hackable. It is
written in a mix of C (for performance sensitive parts) and Python (for
@ -296,10 +296,15 @@ link:kitty/kitty.conf[config file].
== Performance
The main goals for kitty performance are low user-perceived latency while typing
and "smoothness" while scrolling. kitty tries hard to optimize these. To that
end it keeps a cache of each rendered glyph in video RAM so that font rendering
is not a bottleneck. Interaction with child programs takes place in a separate
thread from rendering, to improve smoothness.
and "smoothness" while scrolling, as well as low CPU usage. kitty tries hard to find
an optimum balance for these. To that end it keeps a cache of each rendered
glyph in video RAM so that font rendering is not a bottleneck. Interaction
with child programs takes place in a separate thread from rendering, to improve
smoothness.
There are two parameters you can tune to adjust the performance: ``repaint_delay``
and ``input_delay``. These control the artificial delays introduced into the
render loop to reduce CPU usage. See the link:kitty/kitty.conf[config file] for details.
You can generate detailed per-function performance data using
link:https://github.com/gperftools/gperftools[gperftools]. Build kitty with the

View File

@ -60,7 +60,7 @@ def __init__(self, glfw_window, opts, args):
self.glfw_window_title = None
self.shutting_down = False
self.child_monitor = ChildMonitor(
opts.repaint_delay / 1000.0, glfw_window.window_id(),
glfw_window.window_id(),
self.on_child_death,
DumpCommands(args) if args.dump_commands or args.dump_bytes else None)
set_boss(self)

View File

@ -30,6 +30,7 @@ extern int pthread_setname_np(const char *name);
#include <GLFW/glfw3.h>
#define EXTRA_FDS 2
#define wakeup_main_loop glfwPostEmptyEvent
static void (*parse_func)(Screen*, PyObject*);
@ -123,10 +124,9 @@ new(PyTypeObject *type, PyObject *args, PyObject UNUSED *kwds) {
ChildMonitor *self;
PyObject *dump_callback, *death_notify, *wid;
int ret;
double repaint_delay;
if (the_monitor) { PyErr_SetString(PyExc_RuntimeError, "Can have only a single ChildMonitor instance"); return NULL; }
if (!PyArg_ParseTuple(args, "dOOO", &repaint_delay, &wid, &death_notify, &dump_callback)) return NULL;
if (!PyArg_ParseTuple(args, "OOO", &wid, &death_notify, &dump_callback)) return NULL;
glfw_window_id = PyLong_AsVoidPtr(wid);
if ((ret = pthread_mutex_init(&children_lock, NULL)) != 0) {
PyErr_Format(PyExc_RuntimeError, "Failed to create children_lock mutex: %s", strerror(ret));
@ -148,7 +148,6 @@ new(PyTypeObject *type, PyObject *args, PyObject UNUSED *kwds) {
self->count = 0;
fds[0].fd = wakeup_fds[0]; fds[1].fd = signal_fds[0];
fds[0].events = POLLIN; fds[1].events = POLLIN;
self->repaint_delay = repaint_delay;
the_monitor = self;
return (PyObject*) self;
@ -175,7 +174,7 @@ dealloc(ChildMonitor* self) {
}
static void
wakeup_() {
wakeup_io_loop() {
while(true) {
ssize_t ret = write(wakeup_fds[1], "w", 1);
if (ret < 0) {
@ -208,7 +207,7 @@ join(ChildMonitor *self) {
static PyObject *
wakeup(ChildMonitor UNUSED *self) {
#define wakeup_doc "wakeup() -> wakeup the ChildMonitor I/O thread, forcing it to exit from poll() if it is waiting there."
wakeup_();
wakeup_io_loop();
Py_RETURN_NONE;
}
@ -258,7 +257,7 @@ schedule_write_to_child(unsigned long id, const char *data, size_t sz) {
screen->write_buf = PyMem_RawRealloc(screen->write_buf, screen->write_buf_sz);
if (screen->write_buf == NULL) { fatal("Out of memory."); }
}
if (screen->write_buf_used) wakeup_();
if (screen->write_buf_used) wakeup_io_loop();
screen_mutex(unlock, write);
break;
}
@ -286,31 +285,26 @@ shutdown(ChildMonitor *self) {
Py_RETURN_NONE;
}
static inline bool
do_parse(ChildMonitor *self, Screen *screen) {
bool updated = false;
static inline void
do_parse(ChildMonitor *self, Screen *screen, double now) {
screen_mutex(lock, read);
if (screen->read_buf_sz) {
parse_func(screen, self->dump_callback);
if (screen->read_buf_sz >= READ_BUF_SZ) wakeup_(); // Ensure the read fd has POLLIN set
screen->read_buf_sz = 0;
updated = true;
double time_since_new_input = now - screen->new_input_at;
if (time_since_new_input >= OPT(input_delay)) {
parse_func(screen, self->dump_callback);
if (screen->read_buf_sz >= READ_BUF_SZ) wakeup_io_loop(); // Ensure the read fd has POLLIN set
screen->read_buf_sz = 0;
screen->new_input_at = 0;
} else set_maximum_wait(OPT(input_delay) - time_since_new_input);
}
screen_mutex(unlock, read);
if (LIKELY(updated)) {
glfwPostEmptyEvent();
}
return updated;
}
static double last_parse_at = -1000;
static void
parse_input(ChildMonitor *self) {
// Parse all available input that was read in the I/O thread.
size_t count = 0, remove_count = 0;
double now = monotonic();
double time_since_last_parse = now - last_parse_at;
bool parse_needed = time_since_last_parse >= self->repaint_delay ? true : false;
children_mutex(lock);
while (remove_queue_count) {
remove_queue_count--;
@ -321,15 +315,11 @@ parse_input(ChildMonitor *self) {
if (UNLIKELY(signal_received)) {
glfwSetWindowShouldClose(glfw_window_id, true);
glfwPostEmptyEvent();
} else {
if (parse_needed) {
count = self->count;
for (size_t i = 0; i < count; i++) {
scratch[i] = children[i];
INCREF_CHILD(scratch[i]);
}
last_parse_at = now;
count = self->count;
for (size_t i = 0; i < count; i++) {
scratch[i] = children[i];
INCREF_CHILD(scratch[i]);
}
}
children_mutex(unlock);
@ -345,13 +335,10 @@ parse_input(ChildMonitor *self) {
for (size_t i = 0; i < count; i++) {
if (!scratch[i].needs_removal) {
do_parse(self, scratch[i].screen);
do_parse(self, scratch[i].screen, now);
}
DECREF_CHILD(scratch[i]);
}
if (!parse_needed) {
set_maximum_wait(self->repaint_delay - time_since_last_parse);
}
}
static PyObject *
@ -494,9 +481,9 @@ render_cursor(Window *w, double now) {
}
static inline bool
render(ChildMonitor *self, double now) {
render(double now) {
double time_since_last_render = now - last_render_at;
if (time_since_last_render > self->repaint_delay) {
if (time_since_last_render > OPT(repaint_delay)) {
draw_borders();
#define TD global_state.tab_bar_render_data
if (TD.screen && global_state.num_tabs > 1) draw_cells(TD.vao_idx, TD.xstart, TD.ystart, TD.dx, TD.dy, TD.screen);
@ -536,7 +523,7 @@ render(ChildMonitor *self, double now) {
glfwSwapBuffers(glfw_window_id);
last_render_at = now;
} else {
set_maximum_wait(self->repaint_delay - time_since_last_render);
set_maximum_wait(OPT(repaint_delay) - time_since_last_render);
}
return true;
}
@ -595,7 +582,7 @@ main_loop(ChildMonitor *self) {
while (!glfwWindowShouldClose(glfw_window_id)) {
double now = monotonic();
maximum_wait = -1;
if (!render(self, now)) break;
if (!render(now)) break;
if (global_state.mouse_visible && OPT(mouse_hide_wait) > 0 && now - global_state.last_mouse_activity_at > OPT(mouse_hide_wait)) {
glfwSetInputMode(glfw_window_id, GLFW_CURSOR, GLFW_CURSOR_HIDDEN);
global_state.mouse_visible = false;
@ -714,6 +701,7 @@ read_bytes(int fd, Screen *screen) {
break;
}
if (UNLIKELY(len == 0)) return false;
if (screen->new_input_at == 0) screen->new_input_at = monotonic();
screen_mutex(lock, read);
if (orig_sz != screen->read_buf_sz) {
// The other thread consumed some of the screen read buffer
@ -828,7 +816,7 @@ io_loop(void *data) {
perror("Call to poll() failed");
}
}
if (data_received) glfwPostEmptyEvent();
if (data_received) wakeup_main_loop();
}
children_mutex(lock);
for (i = 0; i < self->count; i++) children[i].needs_removal = true;

View File

@ -215,6 +215,7 @@ def adjust_line_height(x):
'cursor_opacity': to_opacity,
'open_url_modifiers': to_open_url_modifiers,
'repaint_delay': positive_int,
'input_delay': positive_int,
'window_border_width': positive_float,
'window_margin_width': positive_float,
'window_padding_width': positive_float,

View File

@ -251,6 +251,7 @@ typedef struct {
unsigned int parser_state, parser_text_start, parser_buf_pos;
bool parser_has_pending_text;
uint8_t read_buf[READ_BUF_SZ], *write_buf;
double new_input_at;
size_t read_buf_sz, write_buf_sz, write_buf_used;
pthread_mutex_t read_buf_lock, write_buf_lock;
@ -267,7 +268,6 @@ typedef struct {
PyObject_HEAD
PyObject *dump_callback, *update_screen, *death_notify;
double repaint_delay;
unsigned int count;
bool shutting_down;
pthread_t io_thread;

View File

@ -105,11 +105,18 @@ remember_window_size yes
initial_window_width 640
initial_window_height 400
# Delay (in milliseconds) between screen updates. Decreasing it, increases fps
# at the cost of more CPU usage. The default value yields ~100fps which is more
# than sufficient for most uses.
# Delay (in milliseconds) between screen updates. Decreasing it increases
# frames-per-second (FPS) at the cost of more CPU usage. The default value
# yields ~100 FPS which is more than sufficient for most uses.
repaint_delay 10
# Delay (in milliseconds) before input from the program running in the terminal
# is processed. Decreasing it increases responsiveness, but also increases CPU
# usage and might cause flicker in full-screen programs that redraw the entire
# screen on each loop, because kitty is so fast that partial screen updates
# will be drawn.
input_delay 3
# Visual bell duration. Flash the screen when a bell occurs for the specified number of
# seconds. Set to zero to disable.
visual_bell_duration 0.0

View File

@ -126,6 +126,11 @@ color_as_int(PyObject *color) {
#undef I
}
/* Convert a Python integer holding a delay in milliseconds into seconds.
 * Despite its name, set_options uses this converter for both the
 * repaint_delay and the input_delay options.
 * NOTE(review): PyLong_AsUnsignedLong signals failure via the Python error
 * indicator; presumably the caller checks for it -- confirm. */
static inline double
repaint_delay(PyObject *val) {
    const unsigned long delay_ms = PyLong_AsUnsignedLong(val);
    return (double)delay_ms / 1000.0;
}
#define dict_iter(d) { \
PyObject *key, *value; Py_ssize_t pos = 0; \
while (PyDict_Next(d, &pos, &key, &value))
@ -155,6 +160,8 @@ PYWRAP1(set_options) {
S(open_url_modifiers, PyLong_AsUnsignedLong);
S(click_interval, PyFloat_AsDouble);
S(url_color, color_as_int);
S(repaint_delay, repaint_delay);
S(input_delay, repaint_delay);
PyObject *chars = PyObject_GetAttrString(args, "select_by_word_characters");
if (chars == NULL) return NULL;

View File

@ -16,6 +16,7 @@ typedef struct {
unsigned int open_url_modifiers;
char_type select_by_word_characters[256]; size_t select_by_word_characters_count;
color_type url_color;
double repaint_delay, input_delay;
} Options;
typedef struct {