From 668132077f29ff28466209526cd181bd5f842ada Mon Sep 17 00:00:00 2001 From: Andrzej Date: Mon, 30 Apr 2012 18:28:20 +0900 Subject: [PATCH 1/3] Fixed sorting of utf8 file names (maybe) --- thunar/thunar-file.c | 12 ++++++------ 1 files changed, 6 insertions(+), 6 deletions(-) diff --git a/thunar/thunar-file.c b/thunar/thunar-file.c index 9509600..0cc3124 100644 --- a/thunar/thunar-file.c +++ b/thunar/thunar-file.c @@ -3296,8 +3296,8 @@ thunar_file_compare_by_name (const ThunarFile *file_a, const gchar *bp; const gchar *filename_a; const gchar *filename_b; - guchar ac; - guchar bc; + gunichar ac; + gunichar bc; #ifdef G_ENABLE_DEBUG /* probably too expensive to do the instance check every time @@ -3322,8 +3322,8 @@ thunar_file_compare_by_name (const ThunarFile *file_a, for (;; ++ap, ++bp) { /* check if the characters differ or we have a non-ASCII char */ - ac = *((const guchar *)ap); - bc = *((const guchar *)bp); + ac = *((const gunichar *)ap); + bc = *((const gunichar *)bp); if (ac != bc || ac == 0 || ac > 127) break; } @@ -3347,8 +3347,8 @@ thunar_file_compare_by_name (const ThunarFile *file_a, for (;; ++ap, ++bp) { /* check if the characters differ or we have a non-ASCII char */ - ac = *((const guchar *)ap); - bc = *((const guchar *)bp); + ac = *((const gunichar *)ap); + bc = *((const gunichar *)bp); if (g_ascii_tolower (ac) != g_ascii_tolower (bc) || ac == 0 || ac > 127) break; } -- 1.7.5.4 From 3052997d983d15c2ccee9f51b2110a905990b9cd Mon Sep 17 00:00:00 2001 From: Andrzej Date: Tue, 1 May 2012 02:22:48 +0900 Subject: [PATCH 2/3] More sorting fixes. - Making sure that 'thunar_file_compare_by_name' doesn't return 0. (if filenames after lowering case are same, we switch back to case sensitive comparison) - Fixed an inconsistent ordering using strcoll. Strcoll doesn't just compare the first different character, it uses the whole string. 
Before, we were giving it the whole string in case sensitive mode and only the first differing character in the case insensitive mode, which resulted in very inconsistent results. Sorting order is still a bit weird in some cases but there is not much we can do about it as long as we rely on strcoll. --- thunar/thunar-file.c | 78 ++++++++++++++++++++++++++++++++++--------------- 1 files changed, 54 insertions(+), 24 deletions(-) diff --git a/thunar/thunar-file.c b/thunar/thunar-file.c index 0cc3124..76f4d75 100644 --- a/thunar/thunar-file.c +++ b/thunar/thunar-file.c @@ -3311,20 +3311,20 @@ thunar_file_compare_by_name (const ThunarFile *file_a, filename_a = thunar_file_get_display_name (file_a); filename_b = thunar_file_get_display_name (file_b); - /* start at the beginning of both strings */ - ap = filename_a; - bp = filename_b; - /* check if we should ignore case */ - if (G_LIKELY (case_sensitive)) + if (G_LIKELY (case_sensitive == FALSE)) { + /* start at the beginning of both strings */ + ap = filename_a; + bp = filename_b; + /* try simple (fast) ASCII comparison first */ for (;; ++ap, ++bp) { /* check if the characters differ or we have a non-ASCII char */ ac = *((const gunichar *)ap); bc = *((const gunichar *)bp); - if (ac != bc || ac == 0 || ac > 127) + if (g_ascii_tolower (ac) != g_ascii_tolower (bc) || ac == 0 || ac > 127) break; } @@ -3336,24 +3336,33 @@ thunar_file_compare_by_name (const ThunarFile *file_a, /* check if characters differ or end of string */ ac = g_utf8_get_char (ap); bc = g_utf8_get_char (bp); - if (ac != bc || ac == 0) + if (g_unichar_tolower (ac) != g_unichar_tolower (bc) || ac == 0) break; } } } - else + + /* if both strings are equal after case insensitive comparison we switch to case sensitivity */ + case_sensitive = (case_sensitive || (ac == bc && g_unichar_tolower (ac) == g_unichar_tolower (bc))); + + /* if case sensitive */ + if (G_UNLIKELY (case_sensitive == TRUE)) { - /* try simple (fast) ASCII comparison first (case-insensitive!) 
*/ + /* start at the beginning of both strings */ + ap = filename_a; + bp = filename_b; + + /* try simple (fast) ASCII comparison first */ for (;; ++ap, ++bp) { /* check if the characters differ or we have a non-ASCII char */ ac = *((const gunichar *)ap); bc = *((const gunichar *)bp); - if (g_ascii_tolower (ac) != g_ascii_tolower (bc) || ac == 0 || ac > 127) + if (ac != bc || ac == 0 || ac > 127) break; } - /* fallback to Unicode comparison (case-insensitive!) */ + /* fallback to Unicode comparison */ if (G_UNLIKELY (ac > 127 || bc > 127)) { for (;; ap = g_utf8_next_char (ap), bp = g_utf8_next_char (bp)) @@ -3361,19 +3370,18 @@ thunar_file_compare_by_name (const ThunarFile *file_a, /* check if characters differ or end of string */ ac = g_utf8_get_char (ap); bc = g_utf8_get_char (bp); - if (g_unichar_tolower (ac) != g_unichar_tolower (bc) || ac == 0) + if (ac != bc || ac == 0) break; } } } - /* if both strings are equal, we're done */ - if (G_UNLIKELY (ac == bc - || (!case_sensitive - && g_unichar_tolower (ac) == g_unichar_tolower (bc)))) - { - return 0; - } + /* printf ("files: %s\t%s\t%x\t%x\n", ap, bp, ac, bc); */ + +#ifdef G_ENABLE_DEBUG + /* if both strings are equal, we're screwed (two same filenames in one directory?) 
*/ + _thunar_return_val_if_fail (ac != bc, 0); +#endif /* check if one of the characters that differ is a digit */ if (G_UNLIKELY (g_ascii_isdigit (ac) || g_ascii_isdigit (bc))) @@ -3415,26 +3423,48 @@ thunar_file_compare_by_name (const ThunarFile *file_a, */ if (G_LIKELY (case_sensitive)) { + /* printf ("-+-+-: %s\t%s\t%x\t%x\t%d\n", ap, bp, ac, bc, strcoll (ap, bp)); */ return strcoll (ap, bp); } else { + gint result; + /* we use a trick here, so we don't need to allocate * and transform the two strings completely first (8 * byte for each buffer, so all compilers should align * them properly) */ - gchar abuf[8]; - gchar bbuf[8]; + gchar abuf[8] = {'\0',}; + gchar bbuf[8] = {'\0',}; + gchar *ap2; + gchar *bp2; /* transform the unicode chars to strings and * make sure the strings are nul-terminated. */ - abuf[g_unichar_to_utf8 (g_unichar_tolower(ac), abuf)] = '\0'; - bbuf[g_unichar_to_utf8 (g_unichar_tolower(bc), bbuf)] = '\0'; + abuf[g_unichar_to_utf8 (g_unichar_tolower (ac), abuf)] = '\0'; + bbuf[g_unichar_to_utf8 (g_unichar_tolower (bc), bbuf)] = '\0'; + + /* strcoll doesn't just look at the first differing character of the string */ + /* so, in order to match behavior of strcoll (ap, bp) we must add the rest */ + /* of the string. For simplicity and efficiency we only change the case */ + /* of the first character. 
*/ + ap2 = g_strconcat (abuf, g_utf8_next_char (ap), NULL); + bp2 = g_strconcat (bbuf, g_utf8_next_char (bp), NULL); + + /* printf ("-----: %s\t%s\t%x\t%x\t%d\t%d\n", ap2, bp2, ac, bc, strcoll (ap2, bp2), strcoll (ap, bp)); */ /* compare the unicode chars (as strings) */ - return strcoll (abuf, bbuf); + result = strcoll (ap2, bp2); + g_free (ap2); + g_free (bp2); + if (result == 0) + /* if same, repeat with the original string */ + /* this is handled earlier so it should never occur */ + return strcoll (ap, bp); + else + return result; } } #endif -- 1.7.5.4 From d1740e09c7a99522268b2d321c3efc0cb7799459 Mon Sep 17 00:00:00 2001 From: Andrzej Date: Tue, 1 May 2012 02:52:25 +0900 Subject: [PATCH 3/3] Filename sorting MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Added a check for filename length in utf-8 mode. Before 'ああ' < 'あ' --- thunar/thunar-file.c | 6 ++++++ 1 files changed, 6 insertions(+), 0 deletions(-) diff --git a/thunar/thunar-file.c b/thunar/thunar-file.c index 76f4d75..2b4d9a5 100644 --- a/thunar/thunar-file.c +++ b/thunar/thunar-file.c @@ -3418,6 +3418,12 @@ thunar_file_compare_by_name (const ThunarFile *file_a, #ifdef HAVE_STRCOLL if ((ac > 127 || bc > 127) && g_get_charset (NULL)) { + /* check if any of the strings is shorter */ + if (ac == 0) + return -1; + if (bc == 0) + return 1; + /* case-sensitive is easy, case-insensitive is expensive, * but we use a simple optimization to make it fast. */ -- 1.7.5.4