Docs for collation

author Daniel Ehrenberg <littledan@pool-226-177.res.carleton.edu>

Sun, 1 Jun 2008 16:24:17 +0000 (11:24 -0500)

committer Daniel Ehrenberg <littledan@pool-226-177.res.carleton.edu>

Sun, 1 Jun 2008 16:24:17 +0000 (11:24 -0500)
author Daniel Ehrenberg <littledan@pool-226-177.res.carleton.edu>
Sun, 1 Jun 2008 16:24:17 +0000 (11:24 -0500)
committer Daniel Ehrenberg <littledan@pool-226-177.res.carleton.edu>
Sun, 1 Jun 2008 16:24:17 +0000 (11:24 -0500)
diff --git a/extra/unicode/collation/collation-docs.factor b/extra/unicode/collation/collation-docs.factor

index 23538229a446bda16144480e128d2a707d61fdbd..0e92042ddd619f78dcd0cbd44b7304f1cb2f817d 100644 (file)
--- a/extra/unicode/collation/collation-docs.factor
+++ b/extra/unicode/collation/collation-docs.factor
@@ -1,7 +1,42 @@
-USING: help.syntax help.markup ;
+USING: help.syntax help.markup strings byte-arrays ;
  IN: unicode.collation
  
  ABOUT: "unicode.collation"
  
  ARTICLE: "unicode.collation" "Unicode collation algorithm"
-"The Unicode Collation Algorithm (UTS #10) forms a reasonable way to sort strings when accouting for all of the characters in Unicode." ;
+"The Unicode Collation Algorithm (UTS #10) forms a reasonable way to sort strings when accounting for all of the characters in Unicode. At the moment, only the default Unicode collation element table (DUCET) is used, but a more accurate collation would take locale into account. The following words are defined:"
+{ $subsection sort-strings }
+{ $subsection collation-key }
+{ $subsection string<=> }
+{ $subsection primary= }
+{ $subsection secondary= }
+{ $subsection tertiary= }
+{ $subsection quaternary= } ;
+
+HELP: sort-strings
+{ $values { "strings" "a sequence of strings" } { "sorted" "the strings in DUCET order" } }
+{ $description "This word takes a sequence of strings and sorts them according to the UCA, using code point order as a tie-breaker." } ;
+
+HELP: collation-key
+{ $values { "string" string } { "key" byte-array } }
+{ $description "This takes a string and gives a representation of the collation key, which can be compared with <=>." } ;
+
+HELP: string<=>
+{ $values { "str1" string } { "str2" string } { "<=>" "one of +lt+, +gt+ or +eq+" } }
+{ $description "This word takes two strings and compares them using the UCA with the DUCET, using code point order as a tie-breaker." } ;
+
+HELP: primary=
+{ $values { "str1" string } { "str2" string } { "?" "t or f" } }
+{ $description "This checks whether the first level of collation is identical. This is the least specific kind of equality test. In Latin script, it can be understood as ignoring case, punctuation and accent marks." } ;
+
+HELP: secondary=
+{ $values { "str1" string } { "str2" string } { "?" "t or f" } }
+{ $description "This checks whether the first two levels of collation are equal. For Latin script, this means accent marks are significant again, and it is otherwise similar to primary=." } ;
+
+HELP: tertiary=
+{ $values { "str1" string } { "str2" string } { "?" "t or f" } }
+{ $description "Along the same lines as secondary=, but case is significant." } ;
+
+HELP: quaternary=
+{ $values { "str1" string } { "str2" string } { "?" "t or f" } }
+{ $description "This is similar to tertiary= but it makes punctuation significant again, while still leaving out things like null bytes and Hebrew vowel marks, which mean absolutely nothing in collation." } ;
diff --git a/extra/unicode/collation/collation.factor b/extra/unicode/collation/collation.factor

index b12a10709ef35284f234e185f0ec2047f32d1df3..441339d677952282579ed357b78e05ea0e9eef43 100755 (executable)
--- a/extra/unicode/collation/collation.factor
+++ b/extra/unicode/collation/collation.factor
@@ -6,6 +6,7 @@ unicode.syntax macros sequences.deep words unicode.breaks
  quotations ;\r
  IN: unicode.collation\r
  \r
+<PRIVATE\r
  VALUE: ducet\r
  \r
  TUPLE: weight primary secondary tertiary ignorable? ;\r
@@ -115,6 +116,7 @@ ducet insert-helpers
              [ [ variable-weight ] each ]\r
          } cleave\r
      ] { } make ;\r
+PRIVATE>\r
  \r
  : completely-ignorable? ( weight -- ? )\r
      [ primary>> ] [ secondary>> ] [ tertiary>> ] tri\r
@@ -131,11 +133,13 @@ ducet insert-helpers
      nfd string>graphemes graphemes>weights\r
      filter-ignorable weights>bytes ;\r
  \r
+<PRIVATE\r
  : insensitive= ( str1 str2 levels-removed -- ? )\r
      [\r
          swap collation-key swap\r
          [ [ 0 = not ] right-trim but-last ] times\r
      ] curry bi@ = ;\r
+PRIVATE>\r
  \r
  : primary= ( str1 str2 -- ? )\r
      3 insensitive= ;\r
@@ -149,12 +153,14 @@ ducet insert-helpers
  : quaternary= ( str1 str2 -- ? )\r
      0 insensitive= ;\r
  \r
+<PRIVATE\r
  : compare-collation ( {str1,key} {str2,key} -- <=> )\r
      2dup [ second ] bi@ <=> dup +eq+ =\r
      [ drop <=> ] [ 2nip ] if ;\r
  \r
  : w/collation-key ( str -- {str,key} )\r
      dup collation-key 2array ;\r
+PRIVATE>\r
  \r
  : sort-strings ( strings -- sorted )\r
      [ w/collation-key ] map\r
author	Daniel Ehrenberg <littledan@pool-226-177.res.carleton.edu>
	Sun, 1 Jun 2008 16:24:17 +0000 (11:24 -0500)
committer	Daniel Ehrenberg <littledan@pool-226-177.res.carleton.edu>
	Sun, 1 Jun 2008 16:24:17 +0000 (11:24 -0500)
extra/unicode/collation/collation-docs.factor		patch \| blob \| history
extra/unicode/collation/collation.factor		patch \| blob \| history