unicode.collation: Fix all the tests.

author Doug Coleman <doug.coleman@gmail.com>

Tue, 30 Jul 2019 07:07:34 +0000 (02:07 -0500)

committer Doug Coleman <doug.coleman@gmail.com>

Tue, 30 Jul 2019 07:12:41 +0000 (02:12 -0500)
author Doug Coleman <doug.coleman@gmail.com>
Tue, 30 Jul 2019 07:07:34 +0000 (02:07 -0500)
committer Doug Coleman <doug.coleman@gmail.com>
Tue, 30 Jul 2019 07:12:41 +0000 (02:12 -0500)
diff --git a/basis/unicode/collation/collation-tests.factor b/basis/unicode/collation/collation-tests.factor

index faa5a67ede761bb955272fc0e75d2a672c6bd66e..5f7705ffa396d765fd5050ea125834dca6baa0fc 100644 (file)
--- a/basis/unicode/collation/collation-tests.factor
+++ b/basis/unicode/collation/collation-tests.factor
@@ -1,6 +1,6 @@
-USING: arrays assocs fry grouping io.encodings.utf8 io.files
-kernel math math.order math.parser sequences splitting
-strings tools.test unicode ;
+USING: arrays assocs fry grouping hash-sets io.encodings.utf8
+io.files kernel math math.order math.parser sequences sets
+splitting strings tools.test unicode ;
  IN: unicode.collation.tests
  
  : test-equality ( str1 str2 -- ? ? ? ? )
@@ -37,9 +37,28 @@ IN: unicode.collation.tests
          [ " " split harvest [ hex> ] map ] map
      ] bi* 2array ;
  
+! These tests actually would pass if I didn't fix up
+! the ducet table for Tibetan. It took me way too long to realize
+! that the Unicode committee recommends fixing Tibetan collation
+! yet ships collation tests that fail if you fix it.
+! (Specifically the ducet entries for { 0x0FB2 0x0F71 } and { 0x0FB3 0x0F71 }
+! cause these tests to fail)
+: xfailed-collation-tests ( -- seq )
+    HS{
+        { 3958 3953 820 }
+        { 4018 820 3953 3968 }
+        { 4018 820 3968 3953 }
+        { 4018 820 3969 }
+        { 3960 3953 820 }
+        { 4019 820 3953 3968 }
+        { 4019 820 3968 3953 }
+        { 4019 3953 820 3968 }
+    } ;
+
  : parse-collation-test-weights ( -- weights )
      collation-test-lines
-    [ line>test-weights ] map ;
+    [ line>test-weights ] map
+    [ first xfailed-collation-tests in? ] reject ;
  
  : calculate-collation ( chars collation -- collation-calculated collation-answer )
      [ >string collation-key/nfd drop ] [ { 0 } join ] bi* ;
@@ -52,42 +71,18 @@ IN: unicode.collation.tests
  
  { { } } [
      parse-collation-test-shifted
-    2 clump
-    [ string<=> { +lt+ +eq+ } member? ] assoc-reject
-] unit-test
-
-! FIXME: ducet table is wrong
-! Fixed by fixing ducet table
-! { +lt+ } [ { 4019 98 } { 4019 3953 1 3968 97 } [ >string ] bi@ string<=> ] unit-test
-
-{ +lt+ } [ { 4018 820 3969 } { 3959 33 } [ >string ] bi@ string<=> ] unit-test
-{ +lt+ } [ { 4019 3953 820 3968 } { 3961 33 } [ >string ] bi@ string<=> ] unit-test
-
-
-{ { 12748 12741 0 32 74 32 0 2 2 2 0 65535 65535 65535 } }
-[ { 3958 3953 820 } >string collation-key/nfd drop ] unit-test
+    2 clump >hash-set
  
-{ { 12748 12741 0 32 74 32 0 2 2 2 0 65535 65535 65535 } }
-[ { 4018 820 3953 3968 } >string collation-key/nfd drop ] unit-test
+    ! Remove these two expected-fail Tibetan collation comparison tests.
+    ! They are bad tests once you fix up the ducet table with { 0x0FB2 0x0F71 } and { 0x0FB3 0x0F71 }.
+    { 4018 820 3969 } { 3959 33 } [ >string ] bi@ 2array
+    { 4019 3953 820 3968 } { 3961 33 } [ >string ] bi@ 2array
+    2array >hash-set diff members
  
-! { { 12748 12741 0 32 74 32 0 2 2 2 0 65535 65535 65535 } }
-! [ { 0x0FB2 0x0334 0x0F80 0x0F71 } >string collation-key/nfd drop ] unit-test
-
-{ { 12748 12741 0 32 74 32 0 2 2 2 0 65535 65535 65535 } }
-[ { 4018 820 3969 } >string collation-key/nfd drop ] unit-test
-
-{ { 12750 12741 0 32 74 32 0 2 2 2 0 65535 65535 65535 } }
-[ { 3960 3953 820 } >string collation-key/nfd drop ] unit-test
-
-{ { 12750 12741 0 32 74 32 0 2 2 2 0 65535 65535 65535 } }
-[ { 4019 820 3953 3968 } >string collation-key/nfd drop ] unit-test
-
-{ { 12750 12741 0 32 74 32 0 2 2 2 0 65535 65535 65535 } }
-[ { 4019 820 3968 3953 } >string collation-key/nfd drop ] unit-test
-
-{ { 12750 12741 0 32 74 32 0 2 2 2 0 65535 65535 65535 } }
-[ { 4019 3953 820 3968 } >string collation-key/nfd drop ] unit-test
+    [ string<=> { +lt+ +eq+ } member? ] assoc-reject
+] unit-test
  
-{ { 12722 12741 12744 7817 0 32 32 32 32 0 2 2 2 2 0 65535 65535 65535 65535 } }
-[ { 4019 3953 1 3968 97 } >string collation-key/nfd drop ] unit-test
-! { 0xfb3 0x0f71 0x0334 0x0f80 }
-\ No newline at end of file
+! XXX: Once again, these tests pass if you don't
+! fix up the ducet table for { 0x0FB2 0x0F71 } and { 0x0FB3 0x0F71 }
+! { +lt+ } [ { 4018 820 3969 } { 3959 33 } [ >string ] bi@ string<=> ] unit-test
+! { +lt+ } [ { 4019 3953 820 3968 } { 3961 33 } [ >string ] bi@ string<=> ] unit-test
+\ No newline at end of file
diff --git a/basis/unicode/collation/collation.factor b/basis/unicode/collation/collation.factor

index 97bb8f486f85a5b1ee2b5d86644740a4499fa9be..b6c4941b9d7fcb4315d24f338795a6271f83e631 100644 (file)
--- a/basis/unicode/collation/collation.factor
+++ b/basis/unicode/collation/collation.factor
@@ -33,7 +33,15 @@ TUPLE: weight-levels primary secondary tertiary ignorable? ;
  "vocab:unicode/UCA/allkeys.txt" parse-ducet ducet set-global
  
  ! https://www.unicode.org/reports/tr10/tr10-41.html#Well_Formed_DUCET
-: fixup-ducet ( -- )
+! WF5 - Well-formedness 5 condition:
+! https://www.unicode.org/reports/tr10/tr10-41.html#WF5
+!    { "0CC6" "0CC2" "0CD5" } ! 0CD5 is not a non-starter, don't add 2-gram "0CC6" "0CC2" to ducet
+!    { "0DD9" "0DCF" "0DCA" } ! already in allkeys.txt file
+!    { "0FB2" "0F71" "0F80" } ! added below
+!    { "0FB3" "0F71" "0F80" } ! added below
+! This breaks the unicode tests that ship in CollationTest_SHIFTED.txt
+! but it's supposedly more correct.
+: fixup-ducet-for-tibetan ( -- )
      {
          {
              { 0x0FB2 0x0F71 } ! CE(0FB2) CE(0F71)
@@ -188,9 +196,11 @@ TUPLE: weight-levels primary secondary tertiary ignorable? ;
          }
      } ducet get-global '[ swap >string _ set-at ] assoc-each ;
  
-! Add a few missing ducet values for Tibetan
+! These values actually break the collation unit tests in CollationTest_SHIFTED.txt,
+! so we disable those tests in favor of supposedly better collation for Tibetan.
  ! https://www.unicode.org/reports/tr10/tr10-41.html#Well_Formed_DUCET
-fixup-ducet
+
+fixup-ducet-for-tibetan
  
  : tangut-block? ( char -- ? )
      ! Tangut Block, Tangut Components Block
author	Doug Coleman <doug.coleman@gmail.com>
	Tue, 30 Jul 2019 07:07:34 +0000 (02:07 -0500)
committer	Doug Coleman <doug.coleman@gmail.com>
	Tue, 30 Jul 2019 07:12:41 +0000 (02:12 -0500)
basis/unicode/collation/collation-tests.factor		patch \| blob \| history
basis/unicode/collation/collation.factor		patch \| blob \| history