Fix detection of URLs in HTML source code (URLs inside quotes)

Fixes #785
This commit is contained in:
Kovid Goyal 2018-08-03 12:28:23 +05:30
parent 8d20dbe81d
commit e5a720c6fa
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
6 changed files with 49 additions and 17 deletions

View File

@ -42,6 +42,9 @@ Changelog
(:iss:`754`)
- Fix detection of URLs in HTML source code (URLs inside quotes) (:iss:`785`)
0.11.3 [2018-07-10]
------------------------------

View File

@ -170,7 +170,7 @@ def regex_finditer(pat, minimum_match_length, text):
yield s, e
closing_bracket_map = {'(': ')', '[': ']', '{': '}', '<': '>'}
closing_bracket_map = {'(': ')', '[': ']', '{': '}', '<': '>', '*': '*', '"': '"', "'": "'"}
opening_brackets = ''.join(closing_bracket_map)
postprocessor_map = {}
@ -189,15 +189,15 @@ def url(text, s, e):
e -= len(url) - idx
while text[e - 1] in '.,?!' and e > 1: # remove trailing punctuation
e -= 1
# truncate url at closing bracket/quote
if s > 0 and e <= len(text) and text[s-1] in opening_brackets:
q = closing_bracket_map[text[s-1]]
idx = text.find(q, s)
if idx > s:
e = idx
# Restructured Text URLs
if e > 3 and text[e-2:e] == '`_':
e -= 2
# Remove trailing bracket if matched by leading bracket
if s > 0 and e < len(text) and text[s-1] in opening_brackets and text[e-1] == closing_bracket_map[text[s-1]]:
e -= 1
# Remove trailing quote if matched by leading quote
if s > 0 and e < len(text) and text[s-1] in '\'"' and text[e-1] == text[s-1]:
e -= 1
return s, e

View File

@ -132,10 +132,11 @@ line_url_start_at(Line *self, index_type x) {
}
index_type
line_url_end_at(Line *self, index_type x, bool check_short) {
line_url_end_at(Line *self, index_type x, bool check_short, char_type sentinel) {
index_type ans = x;
if (x >= self->xnum || (check_short && self->xnum <= MIN_URL_LEN + 3)) return 0;
while (ans < self->xnum && is_url_char(self->cpu_cells[ans].ch)) ans++;
if (sentinel) { while (ans < self->xnum && self->cpu_cells[ans].ch != sentinel && is_url_char(self->cpu_cells[ans].ch)) ans++; }
else { while (ans < self->xnum && is_url_char(self->cpu_cells[ans].ch)) ans++; }
if (ans) ans--;
while (ans > x && can_strip_from_end_of_url(self->cpu_cells[ans].ch)) ans--;
return ans;
@ -148,9 +149,11 @@ url_start_at(Line *self, PyObject *x) {
}
// Python-callable wrapper around line_url_end_at().
// NOTE(review): this span is a rendered diff hunk, so both the pre-change
// (single PyObject *x argument) and post-change (args tuple) lines appear below.
static PyObject*
url_end_at(Line *self, PyObject *x) {
url_end_at(Line *self, PyObject *args) {
// NOTE(review): the doc string below still describes url_end_at(x) only; the
// tuple-parsing variant also accepts an optional second unsigned integer (sentinel).
#define url_end_at_doc "url_end_at(x) -> Return the end cell number for a URL containing x or 0 if not found"
return PyLong_FromUnsignedLong((unsigned long)line_url_end_at(self, PyLong_AsUnsignedLong(x), true));
// sentinel keeps its initial value of 0 when the caller omits the optional argument
unsigned int x, sentinel = 0;
// "I|I": one required unsigned int followed by one optional unsigned int
if (!PyArg_ParseTuple(args, "I|I", &x, &sentinel)) return NULL;
// check_short is always passed as true from this wrapper
return PyLong_FromUnsignedLong((unsigned long)line_url_end_at(self, x, true, sentinel));
}
// }}}
@ -560,7 +563,7 @@ static PyMethodDef methods[] = {
METHOD(is_continued, METH_NOARGS)
METHOD(width, METH_O)
METHOD(url_start_at, METH_O)
METHOD(url_end_at, METH_O)
METHOD(url_end_at, METH_VARARGS)
METHOD(sprite_at, METH_O)
{NULL} /* Sentinel */

View File

@ -59,7 +59,7 @@ void line_set_char(Line *, unsigned int , uint32_t , unsigned int , Cursor *, bo
void line_right_shift(Line *, unsigned int , unsigned int );
void line_add_combining_char(Line *, uint32_t , unsigned int );
index_type line_url_start_at(Line *self, index_type x);
index_type line_url_end_at(Line *self, index_type x, bool);
index_type line_url_end_at(Line *self, index_type x, bool, char_type);
index_type line_as_ansi(Line *self, Py_UCS4 *buf, index_type buflen);
unsigned int line_length(Line *self);
size_t cell_as_unicode(CPUCell *cell, bool include_cc, Py_UCS4 *buf, char_type);

View File

@ -209,32 +209,57 @@ extend_selection(Window *w) {
}
// Extend a URL whose end (*x, *y) sits on the last cell of its line onto the
// following visual lines, updating *x and *y in place. At most 10 iterations
// of the loop body are run.
// NOTE(review): this span is a rendered diff hunk, so the pre-change signature
// and line_url_end_at() call (without `sentinel`) appear next to the new ones.
static inline void
extend_url(Screen *screen, Line *line, index_type *x, index_type *y) {
extend_url(Screen *screen, Line *line, index_type *x, index_type *y, char_type sentinel) {
unsigned int count = 0;
while(count++ < 10) {
// only continue when the current end column is the final cell of the line
if (*x != line->xnum - 1) break;
line = screen_visual_line(screen, *y + 1);
if (!line) break; // we deliberately allow non-continued lines as some programs, like mutt split URLs with newlines at line boundaries
index_type new_x = line_url_end_at(line, 0, false);
// scan the next line from column 0 with check_short disabled, passing the sentinel through
index_type new_x = line_url_end_at(line, 0, false, sentinel);
// a zero result ends the extension
if (!new_x) break;
*y += 1; *x = new_x;
}
}
// Return the character that should terminate a URL beginning at column
// url_start, chosen from the character in the cell just before it:
//   "  '  *        -> the same character
//   (  [  {  <     -> the matching closing bracket
// Returns 0 when url_start is 0, when url_start is not less than line->xnum,
// or when the preceding character is none of the above.
static inline char_type
get_url_sentinel(Line *line, index_type url_start) {
    // No usable preceding cell: nothing can delimit the URL
    if (url_start == 0 || url_start >= line->xnum) return 0;

    const char_type opener = line->cpu_cells[url_start - 1].ch;

    // Quote-like delimiters close with themselves
    if (opener == '"' || opener == '\'' || opener == '*') return opener;

    // Brackets close with their counterpart
    if (opener == '(') return ')';
    if (opener == '[') return ']';
    if (opener == '{') return '}';
    if (opener == '<') return '>';

    return 0;
}
static inline void
detect_url(Screen *screen, unsigned int x, unsigned int y) {
bool has_url = false;
index_type url_start, url_end = 0;
Line *line = screen_visual_line(screen, y);
char_type sentinel;
if (line) {
url_start = line_url_start_at(line, x);
if (url_start < line->xnum) url_end = line_url_end_at(line, x, true);
sentinel = get_url_sentinel(line, url_start);
if (url_start < line->xnum) url_end = line_url_end_at(line, x, true, sentinel);
has_url = url_end > url_start;
}
if (has_url) {
mouse_cursor_shape = HAND;
index_type y_extended = y;
extend_url(screen, line, &url_end, &y_extended);
extend_url(screen, line, &url_end, &y_extended, sentinel);
screen_mark_url(screen, url_start, y, url_end, y_extended);
} else {
mouse_cursor_shape = BEAM;

View File

@ -29,3 +29,4 @@ class TestHints(BaseTest):
t(u + '\nxxx', u + 'xxx', len(u))
t('link:{}[xxx]'.format(u), u)
t('`xyz <{}>`_.'.format(u), u)
t('<a href="{}">moo'.format(u), u)