extra/unicode/collation/collation-docs.factor

   1 USING: help.syntax help.markup strings byte-arrays ;
   2 IN: unicode.collation
   3
   4 ABOUT: "unicode.collation"
   5
   6 ARTICLE: "unicode.collation" "Unicode collation algorithm (UCA)" ! overview article; lists every documented word as a subsection
   7 "The Unicode Collation Algorithm (UTS #10) forms a reasonable way to sort strings when accounting for all of the characters in Unicode. At the moment, only the default Unicode collation element table (DUCET) is used, but a more accurate collation would take locale into account. The following words are defined:"
   8 { $subsection sort-strings }
   9 { $subsection collation-key }
  10 { $subsection string<=> }
  11 { $subsection primary= }
  12 { $subsection secondary= }
  13 { $subsection tertiary= }
  14 { $subsection quaternary= } ;
  15
  16 HELP: sort-strings ! help entry for the string-sorting word
  17 { $values { "strings" "a sequence of strings" } { "sorted" "the strings in DUCET order" } } ! both values are described in prose rather than by class
  18 { $description "This word takes a sequence of strings and sorts them according to the UCA, using code point order as a tie-breaker." } ;
  19
  20 HELP: collation-key ! help entry for the word that maps a string to its collation key
  21 { $values { "string" string } { "key" byte-array } }
  22 { $description "This takes a string and gives a representation of the collation key, which can be compared with " { $snippet "<=>" } "." } ;
  23
  24 HELP: string<=> ! help entry for the UCA-based three-way string comparison
  25 { $values { "str1" string } { "str2" string } { "<=>" "one of +lt+, +gt+ or +eq+" } } ! the result value is described in prose, not by class
  26 { $description "This word takes two strings and compares them using the UCA with the DUCET, using code point order as a tie-breaker." } ;
  27
  28 HELP: primary= ! help entry for the first-level (least specific) collation equality test
  29 { $values { "str1" string } { "str2" string } { "?" "t or f" } } ! same value layout as the other level-equality entries in this file
  30 { $description "This checks whether the first level of collation is identical. This is the least specific kind of equality test. In Latin script, it can be understood as ignoring case, punctuation and accent marks." } ;
  31
  32 HELP: secondary= ! help entry for the two-level collation equality test
  33 { $values { "str1" string } { "str2" string } { "?" "t or f" } }
  34 { $description "This checks whether the first two levels of collation are equal. For Latin script, this means accent marks are significant again, and it is otherwise similar to " { $link primary= } "." } ;
  35
  36 HELP: tertiary= ! help entry for the collation equality test in which case is significant
  37 { $values { "str1" string } { "str2" string } { "?" "t or f" } }
  38 { $description "Along the same lines as " { $link secondary= } ", but case is significant." } ;
  39
  40 HELP: quaternary= ! help entry for the collation equality test in which punctuation is significant
  41 { $values { "str1" string } { "str2" string } { "?" "t or f" } }
  42 { $description "This is similar to " { $link tertiary= } " but it makes punctuation significant again, while still leaving out things like null bytes and Hebrew vowel marks, which mean absolutely nothing in collation." } ;