From 794e0446e94f8d0f994cda4eae47c89370e3141c Mon Sep 17 00:00:00 2001 From: Ignacio Casal Quinteiro Date: Tue, 26 Oct 2010 22:02:38 +0200 Subject: [PATCH] Add case insensitive to GtkTextIter. Fixes bug #61852. This code has been taken from GtkSourceView so also kudos to Paolo Maggi and Paolo Borelli for helping with this patch. --- gtk/gtktextiter.c | 399 +++++++++++++++++++++++++++++++++++++++------- gtk/gtktextiter.h | 7 +- 2 files changed, 347 insertions(+), 59 deletions(-) diff --git a/gtk/gtktextiter.c b/gtk/gtktextiter.c index 380778f90c..6ee68bd83b 100644 --- a/gtk/gtktextiter.c +++ b/gtk/gtktextiter.c @@ -4383,7 +4383,8 @@ static void forward_chars_with_skipping (GtkTextIter *iter, gint count, gboolean skip_invisible, - gboolean skip_nontext) + gboolean skip_nontext, + gboolean skip_decomp) { gint i; @@ -4396,6 +4397,10 @@ forward_chars_with_skipping (GtkTextIter *iter, { gboolean ignored = FALSE; + /* minimal workaround to avoid the infinite loop of bug #168247. */ + if (gtk_text_iter_is_end (iter)) + return; + if (skip_nontext && gtk_text_iter_get_char (iter) == GTK_TEXT_UNKNOWN_CHAR) ignored = TRUE; @@ -4405,6 +4410,25 @@ forward_chars_with_skipping (GtkTextIter *iter, _gtk_text_btree_char_is_invisible (iter)) ignored = TRUE; + if (!ignored && skip_decomp) + { + /* being UTF8 correct sucks: this accounts for extra + offsets coming from canonical decompositions of + UTF8 characters (e.g. accented characters) which + g_utf8_normalize() performs */ + gchar *normal; + gchar *casefold; + gchar buffer[6]; + gint buffer_len; + + buffer_len = g_unichar_to_utf8 (gtk_text_iter_get_char (iter), buffer); + casefold = g_utf8_casefold (buffer, buffer_len); + normal = g_utf8_normalize (casefold, -1, G_NORMALIZE_NFD); + i -= (g_utf8_strlen (normal, -1) - 1); + g_free (normal); + g_free (casefold); + } + gtk_text_iter_forward_char (iter); if (!ignored) @@ -4412,11 +4436,209 @@ forward_chars_with_skipping (GtkTextIter *iter, } } +static const gchar * +pointer_from_offset_skipping_decomp (const gchar *str, + gint offset) +{ + gchar *casefold, *normal; + const gchar *p, *q; + + p = str; + + while (offset > 0) + { + q = g_utf8_next_char (p); + casefold = g_utf8_casefold (p, q - p); + normal = g_utf8_normalize (casefold, -1, G_NORMALIZE_NFD); + offset -= g_utf8_strlen (normal, -1); + g_free (casefold); + g_free (normal); + p = q; + } + + return p; +} + +static gboolean +exact_prefix_cmp (const gchar *string, + const gchar *prefix, + guint prefix_len) +{ + GUnicodeType type; + + if (strncmp (string, prefix, prefix_len) != 0) + return FALSE; + if (string[prefix_len] == '\0') + return TRUE; + + type = g_unichar_type (g_utf8_get_char (string + prefix_len)); + + /* If string contains prefix, check that prefix is not followed + * by a unicode mark symbol, e.g. that trailing 'a' in prefix + * is not part of two-char a-with-hat symbol in string. */ + return type != G_UNICODE_COMBINING_MARK && + type != G_UNICODE_ENCLOSING_MARK && + type != G_UNICODE_NON_SPACING_MARK; +} + +static const gchar * +utf8_strcasestr (const gchar *haystack, + const gchar *needle) +{ + gsize needle_len; + gsize haystack_len; + const gchar *ret = NULL; + gchar *p; + gchar *casefold; + gchar *caseless_haystack; + gint i; + + g_return_val_if_fail (haystack != NULL, NULL); + g_return_val_if_fail (needle != NULL, NULL); + + casefold = g_utf8_casefold (haystack, -1); + caseless_haystack = g_utf8_normalize (casefold, -1, G_NORMALIZE_NFD); + g_free (casefold); + + needle_len = g_utf8_strlen (needle, -1); + haystack_len = g_utf8_strlen (caseless_haystack, -1); + + if (needle_len == 0) + { + ret = (gchar *)haystack; + goto finally; + } + + if (haystack_len < needle_len) + { + ret = NULL; + goto finally; + } + + p = (gchar *)caseless_haystack; + needle_len = strlen (needle); + i = 0; + + while (*p) + { + if (exact_prefix_cmp (p, needle, needle_len)) + { + ret = pointer_from_offset_skipping_decomp (haystack, i); + goto finally; + } + + p = g_utf8_next_char (p); + i++; + } + +finally: + g_free (caseless_haystack); + + return ret; +} + +static const gchar * +utf8_strrcasestr (const gchar *haystack, + const gchar *needle) +{ + gsize needle_len; + gsize haystack_len; + const gchar *ret = NULL; + gchar *p; + gchar *casefold; + gchar *caseless_haystack; + gint i; + + g_return_val_if_fail (haystack != NULL, NULL); + g_return_val_if_fail (needle != NULL, NULL); + + casefold = g_utf8_casefold (haystack, -1); + caseless_haystack = g_utf8_normalize (casefold, -1, G_NORMALIZE_NFD); + g_free (casefold); + + needle_len = g_utf8_strlen (needle, -1); + haystack_len = g_utf8_strlen (caseless_haystack, -1); + + if (needle_len == 0) + { + ret = (gchar *)haystack; + goto finally; + } + + if (haystack_len < needle_len) + { + ret = NULL; + goto finally; + } + + i = haystack_len - needle_len; + p = g_utf8_offset_to_pointer (caseless_haystack, i); + needle_len = strlen (needle); + + while (p >= caseless_haystack) + { + if (exact_prefix_cmp (p, needle, needle_len)) + { + ret = pointer_from_offset_skipping_decomp (haystack, i); + goto finally; + } + + p = g_utf8_prev_char (p); + i--; + } + +finally: + g_free (caseless_haystack); + + return ret; +} + +/* normalizes caseless strings and returns true if @s2 matches + the start of @s1 */ +static gboolean +utf8_caselessnmatch (const gchar *s1, + const gchar *s2, + gssize n1, + gssize n2) +{ + gchar *casefold; + gchar *normalized_s1; + gchar *normalized_s2; + gint len_s1; + gint len_s2; + gboolean ret = FALSE; + + g_return_val_if_fail (s1 != NULL, FALSE); + g_return_val_if_fail (s2 != NULL, FALSE); + g_return_val_if_fail (n1 > 0, FALSE); + g_return_val_if_fail (n2 > 0, FALSE); + + casefold = g_utf8_casefold (s1, n1); + normalized_s1 = g_utf8_normalize (casefold, -1, G_NORMALIZE_NFD); + g_free (casefold); + + casefold = g_utf8_casefold (s2, n2); + normalized_s2 = g_utf8_normalize (casefold, -1, G_NORMALIZE_NFD); + g_free (casefold); + + len_s1 = strlen (normalized_s1); + len_s2 = strlen (normalized_s2); + + if (len_s1 >= len_s2) + ret = (strncmp (normalized_s1, normalized_s2, len_s2) == 0); + + g_free (normalized_s1); + g_free (normalized_s2); + + return ret; +} + static gboolean lines_match (const GtkTextIter *start, const gchar **lines, gboolean visible_only, gboolean slice, + gboolean case_insensitive, GtkTextIter *match_start, GtkTextIter *match_end) { @@ -4460,14 +4682,25 @@ lines_match (const GtkTextIter *start, } if (match_start) /* if this is the first line we're matching */ - found = strstr (line_text, *lines); + { + if (!case_insensitive) + found = strstr (line_text, *lines); + else + found = utf8_strcasestr (line_text, *lines); + } else { /* If it's not the first line, we have to match from the * start of the line. */ - if (strncmp (line_text, *lines, strlen (*lines)) == 0) - found = line_text; + if ((!case_insensitive && + (strncmp (line_text, *lines, strlen (*lines)) == 0)) || + (case_insensitive && + utf8_caselessnmatch (line_text, *lines, strlen (line_text), + strlen (*lines)))) + { + found = line_text; + } else found = NULL; } @@ -4486,19 +4719,14 @@ lines_match (const GtkTextIter *start, /* If match start needs to be returned, set it to the * start of the search string. */ + forward_chars_with_skipping (&next, offset, + visible_only, !slice, FALSE); if (match_start) - { - *match_start = next; - - forward_chars_with_skipping (match_start, offset, - visible_only, !slice); - } + *match_start = next; /* Go to end of search string */ - offset += g_utf8_strlen (*lines, -1); - - forward_chars_with_skipping (&next, offset, - visible_only, !slice); + forward_chars_with_skipping (&next, g_utf8_strlen (*lines, -1), + visible_only, !slice, TRUE); g_free (line_text); @@ -4510,17 +4738,20 @@ lines_match (const GtkTextIter *start, /* pass NULL for match_start, since we don't need to find the * start again. */ - return lines_match (&next, lines, visible_only, slice, NULL, match_end); + return lines_match (&next, lines, visible_only, slice, case_insensitive, NULL, match_end); } /* strsplit () that retains the delimiter as part of the string. */ static gchar ** strbreakup (const char *string, const char *delimiter, - gint max_tokens) + gint max_tokens, + gint *num_strings, + gboolean case_insensitive) { GSList *string_list = NULL, *slist; gchar **str_array, *s; + gchar *casefold, *new_string; guint i, n = 1; g_return_val_if_fail (string != NULL, NULL); @@ -4537,12 +4768,20 @@ strbreakup (const char *string, do { guint len; - gchar *new_string; len = s - string + delimiter_len; new_string = g_new (gchar, len + 1); strncpy (new_string, string, len); new_string[len] = 0; + + if (case_insensitive) + { + casefold = g_utf8_casefold (new_string, -1); + g_free (new_string); + new_string = g_utf8_normalize (casefold, -1, G_NORMALIZE_NFD); + g_free (casefold); + } + string_list = g_slist_prepend (string_list, new_string); n++; string = s + delimiter_len; @@ -4553,7 +4792,17 @@ strbreakup (const char *string, if (*string) { n++; - string_list = g_slist_prepend (string_list, g_strdup (string)); + + if (case_insensitive) + { + casefold = g_utf8_casefold (string, -1); + new_string = g_utf8_normalize (casefold, -1, G_NORMALIZE_NFD); + g_free (casefold); + } + else + new_string = g_strdup (string); + + string_list = g_slist_prepend (string_list, new_string); } str_array = g_new (gchar*, n); @@ -4566,6 +4815,9 @@ strbreakup (const char *string, g_slist_free (string_list); + if (num_strings != NULL) + *num_strings = n - 1; + return str_array; } @@ -4592,6 +4844,8 @@ strbreakup (const char *string, * pixbufs or child widgets mixed inside the matched range. If these * flags are not given, the match must be exact; the special 0xFFFC * character in @str will match embedded pixbufs or child widgets. + * If you specify the #GTK_TEXT_SEARCH_CASE_INSENSITIVE flag, the text will + * be matched regardless of what case it is in. * * Return value: whether a match was found **/ @@ -4609,7 +4863,8 @@ gtk_text_iter_forward_search (const GtkTextIter *iter, GtkTextIter search; gboolean visible_only; gboolean slice; - + gboolean case_insensitive; + g_return_val_if_fail (iter != NULL, FALSE); g_return_val_if_fail (str != NULL, FALSE); @@ -4640,10 +4895,11 @@ gtk_text_iter_forward_search (const GtkTextIter *iter, visible_only = (flags & GTK_TEXT_SEARCH_VISIBLE_ONLY) != 0; slice = (flags & GTK_TEXT_SEARCH_TEXT_ONLY) == 0; - + case_insensitive = (flags & GTK_TEXT_SEARCH_CASE_INSENSITIVE) != 0; + /* locate all lines */ - lines = strbreakup (str, "\n", -1); + lines = strbreakup (str, "\n", -1, NULL, case_insensitive); search = *iter; @@ -4660,7 +4916,7 @@ gtk_text_iter_forward_search (const GtkTextIter *iter, break; if (lines_match (&search, (const gchar**)lines, - visible_only, slice, &match, &end)) + visible_only, slice, case_insensitive, &match, &end)) { if (limit == NULL || (limit && @@ -4686,8 +4942,9 @@ gtk_text_iter_forward_search (const GtkTextIter *iter, } static gboolean -vectors_equal_ignoring_trailing (gchar **vec1, - gchar **vec2) +vectors_equal_ignoring_trailing (gchar **vec1, + gchar **vec2, + gboolean case_insensitive) { /* Ignores trailing chars in vec2's last line */ @@ -4698,37 +4955,67 @@ vectors_equal_ignoring_trailing (gchar **vec1, while (*i1 && *i2) { - if (strcmp (*i1, *i2) != 0) - { - if (*(i2 + 1) == NULL) /* if this is the last line */ - { - gint len1 = strlen (*i1); - gint len2 = strlen (*i2); + gint len1; + gint len2; - if (len2 >= len1 && - strncmp (*i1, *i2, len1) == 0) + if (!case_insensitive) + { + if (strcmp (*i1, *i2) != 0) + { + if (*(i2 + 1) == NULL) /* if this is the last line */ { - /* We matched ignoring the trailing stuff in vec2 */ - return TRUE; + len1 = strlen (*i1); + len2 = strlen (*i2); + + if (len2 >= len1 && + strncmp (*i1, *i2, len1) == 0) + { + /* We matched ignoring the trailing stuff in vec2 */ + return TRUE; + } + else + { + return FALSE; + } } else { return FALSE; } } - else + } + else + { + len1 = strlen (*i1); + len2 = strlen (*i2); + + if (!utf8_caselessnmatch (*i1, *i2, len1, len2)) { - return FALSE; + if (*(i2 + 1) == NULL) /* if this is the last line */ + { + if (utf8_caselessnmatch (*i2, *i1, len2, len1)) + { + /* We matched ignoring the trailing stuff in vec2 */ + return TRUE; + } + else + { + return FALSE; + } + } + else + { + return FALSE; + } } } + ++i1; ++i2; } if (*i1 || *i2) - { - return FALSE; - } + return FALSE; else return TRUE; } @@ -4739,10 +5026,12 @@ struct _LinesWindow { gint n_lines; gchar **lines; + GtkTextIter first_line_start; GtkTextIter first_line_end; - gboolean slice; - gboolean visible_only; + + guint slice : 1; + guint visible_only : 1; }; static void @@ -4896,6 +5185,7 @@ gtk_text_iter_backward_search (const GtkTextIter *iter, gboolean retval = FALSE; gboolean visible_only; gboolean slice; + gboolean case_insensitive; g_return_val_if_fail (iter != NULL, FALSE); g_return_val_if_fail (str != NULL, FALSE); @@ -4926,18 +5216,11 @@ gtk_text_iter_backward_search (const GtkTextIter *iter, visible_only = (flags & GTK_TEXT_SEARCH_VISIBLE_ONLY) != 0; slice = (flags & GTK_TEXT_SEARCH_TEXT_ONLY) == 0; - + case_insensitive = (flags & GTK_TEXT_SEARCH_CASE_INSENSITIVE) != 0; + /* locate all lines */ - lines = strbreakup (str, "\n", -1); - - l = lines; - n_lines = 0; - while (*l) - { - ++n_lines; - ++l; - } + lines = strbreakup (str, "\n", -1, &n_lines, case_insensitive); win.n_lines = n_lines; win.slice = slice; @@ -4950,7 +5233,7 @@ gtk_text_iter_backward_search (const GtkTextIter *iter, do { - gchar *first_line_match; + const gchar *first_line_match; if (limit && gtk_text_iter_compare (limit, &win.first_line_end) > 0) @@ -4963,10 +5246,14 @@ gtk_text_iter_backward_search (const GtkTextIter *iter, * end in '\n', so this will only match at the * end of the first line, which is correct. */ - first_line_match = g_strrstr (*win.lines, *lines); + if (!case_insensitive) + first_line_match = g_strrstr (*win.lines, *lines); + else + first_line_match = utf8_strrcasestr (*win.lines, *lines); if (first_line_match && - vectors_equal_ignoring_trailing (lines + 1, win.lines + 1)) + vectors_equal_ignoring_trailing (lines + 1, win.lines + 1, + case_insensitive)) { /* Match! */ gint offset; @@ -4979,7 +5266,7 @@ gtk_text_iter_backward_search (const GtkTextIter *iter, next = win.first_line_start; start_tmp = next; forward_chars_with_skipping (&start_tmp, offset, - visible_only, !slice); + visible_only, !slice, FALSE); if (limit && gtk_text_iter_compare (limit, &start_tmp) > 0) @@ -4997,7 +5284,7 @@ gtk_text_iter_backward_search (const GtkTextIter *iter, } forward_chars_with_skipping (&next, offset, - visible_only, !slice); + visible_only, !slice, TRUE); if (match_end) *match_end = next; diff --git a/gtk/gtktextiter.h b/gtk/gtktextiter.h index 046d436802..af02536368 100644 --- a/gtk/gtktextiter.h +++ b/gtk/gtktextiter.h @@ -37,9 +37,10 @@ G_BEGIN_DECLS typedef enum { - GTK_TEXT_SEARCH_VISIBLE_ONLY = 1 << 0, - GTK_TEXT_SEARCH_TEXT_ONLY = 1 << 1 - /* Possible future plans: SEARCH_CASE_INSENSITIVE, SEARCH_REGEXP */ + GTK_TEXT_SEARCH_VISIBLE_ONLY = 1 << 0, + GTK_TEXT_SEARCH_TEXT_ONLY = 1 << 1, + GTK_TEXT_SEARCH_CASE_INSENSITIVE = 1 << 2, + /* Possible future plans: SEARCH_REGEXP */ } GtkTextSearchFlags; /*