Making collation better

author Daniel Ehrenberg <microdan@gmail.com>

Sat, 24 May 2008 17:17:08 +0000 (12:17 -0500)

committer Daniel Ehrenberg <microdan@gmail.com>

Sat, 24 May 2008 17:17:08 +0000 (12:17 -0500)
author Daniel Ehrenberg <microdan@gmail.com>
Sat, 24 May 2008 17:17:08 +0000 (12:17 -0500)
committer Daniel Ehrenberg <microdan@gmail.com>
Sat, 24 May 2008 17:17:08 +0000 (12:17 -0500)
diff --git a/extra/unicode/collation/collation-tests.factor b/extra/unicode/collation/collation-tests.factor

index cabe09b88f205f6df30c2b35334a31ff5b300826..33c27984a69dd2845c3ecc68d7a66503ea4e2da7 100755 (executable)
--- a/extra/unicode/collation/collation-tests.factor
+++ b/extra/unicode/collation/collation-tests.factor
@@ -8,14 +8,16 @@ IN: unicode.collation.tests
      [ ";" split1 drop " " split [ hex> ] "" map-as ] map ;\r
  \r
  : test-two ( str1 str2 -- )\r
-    [ +lt+ ] -rot [ string<=> ] 2curry unit-test ;  \r
-\r
-: test parse-test 2 <clumps> [ test-two ] assoc-each ;\r
+    [ +lt+ ] -rot [ string<=> ] 2curry unit-test ;\r
  \r
  : find-failure\r
      parse-test dup 2 <clumps>\r
      [ string<=> +lt+ = not ] assoc-find drop ;\r
  \r
+: (find-failure)\r
+    dup 2 <clumps>\r
+    [ string<=> +lt+ = not ] assoc-find drop ;\r
+\r
  : failures\r
      parse-test dup 2 <clumps>\r
      [ string<=> +lt+ = not ] assoc-filter dup assoc-size ;\r
diff --git a/extra/unicode/collation/collation.factor b/extra/unicode/collation/collation.factor

index 2147b3f02ce9ba1dca4242e92770fd12ad749f36..786693158fc5771ef555ae19eef9abc2522c16ff 100755 (executable)
--- a/extra/unicode/collation/collation.factor
+++ b/extra/unicode/collation/collation.factor
@@ -1,7 +1,7 @@
  USING: sequences io.files io.encodings.ascii kernel values\r
  splitting accessors math.parser ascii io assocs strings math\r
  namespaces sorting combinators math.order arrays\r
-unicode.normalize ;\r
+unicode.normalize unicode.data combinators.lib locals ;\r
  IN: unicode.collation\r
  \r
  VALUE: ducet\r
@@ -33,14 +33,39 @@ ascii <file-reader> parse-ducet \ ducet set-value
  : derive-weight ( char -- weight )\r
      ! This should check Noncharacter_Code_Point\r
      ! If yes, then ignore the character\r
-    ! otherwise, apply derivation formula\r
+    ! otherwise, apply derivation formula with the right base\r
      drop { } ;\r
  \r
-: string>weights ( string -- weights )\r
-    ! This should actually look things up with\r
-    ! multichar collation elements\r
-    ! Also, do weight derivation for things not in DUCET\r
-    [ dup 1string ducet at [ ] [ derive-weight ] ?if ]\r
+: last ( -- char )\r
+    building get empty? [ 0 ] [ building get peek peek ] if ;\r
+\r
+: blocked? ( char -- ? )\r
+    combining-class [\r
+        last combining-class =\r
+    ] [ last combining-class ] if* ;\r
+\r
+: possible-bases ( -- slice-of-building )\r
+    building get dup [ first combining-class not ] find-last\r
+    drop [ 0 ] unless* tail-slice ;\r
+\r
+:: ?combine ( char slice i -- ? )\r
+    [let | str [ i slice nth char suffix ] |\r
+        str ducet key? dup\r
+        [ str i slice set-nth ] when\r
+    ] ;\r
+\r
+: add ( char -- )\r
+    dup blocked? [ 1string , ] [\r
+        dup possible-bases dup length\r
+        [ ?combine ] 2with contains?\r
+        [ drop ] [ 1string , ] if\r
+    ] if ;\r
+\r
+: string>graphemes ( string -- graphemes )\r
+    [ [ add ] each ] { } make ;\r
+\r
+: graphemes>weights ( graphemes -- weights )\r
+    [ dup ducet at [ ] [ derive-weight ] ?if ]\r
      { } map-as concat ;\r
  \r
  : append-weights ( weights quot -- )\r
@@ -65,8 +90,6 @@ ascii <file-reader> parse-ducet \ ducet set-value
      [ zero? ] tri@ and and ;\r
  \r
  : filter-ignorable ( weights -- weights' )\r
-    ! Filters primary-ignorables which follow variable weighteds\r
-    ! and all completely-ignorables\r
      >r f r> [\r
          tuck primary>> zero? and\r
          [ swap ignorable?>> or ]\r
@@ -74,7 +97,8 @@ ascii <file-reader> parse-ducet \ ducet set-value
      ] filter nip ;\r
  \r
  : collation-key ( string -- key )\r
-    nfd string>weights filter-ignorable weights>bytes ;\r
+    nfd string>graphemes graphemes>weights\r
+    filter-ignorable weights>bytes ;\r
  \r
  : compare-collation ( {str1,key} {str2,key} -- <=> )\r
      2dup [ second ] bi@ <=> dup +eq+ =\r
@@ -87,3 +111,17 @@ ascii <file-reader> parse-ducet \ ducet set-value
  \r
  : string<=> ( str1 str2 -- <=> )\r
      [ dup collation-key 2array ] bi@ compare-collation ;\r
+\r
+! Fix up table for long contractions\r
+: help-one ( assoc key -- )\r
+    ! Does this need to be more general?\r
+    2 head 2dup swap key? [ 2drop ] [\r
+        [ [ 1string swap at ] with { } map-as concat ]\r
+        [ swap set-at ] 2bi\r
+    ] if ;\r
+\r
+: insert-helpers ( assoc -- )\r
+    dup keys [ length 3 >= ] filter\r
+    [ help-one ] with each ;\r
+\r
+ducet insert-helpers\r
diff --git a/extra/yahoo/yahoo.factor b/extra/yahoo/yahoo.factor

old mode 100644 (file)

new mode 100755 (executable)

index 89f937d..214ad04
--- a/extra/yahoo/yahoo.factor
+++ b/extra/yahoo/yahoo.factor
@@ -1,7 +1,7 @@
  ! Copyright (C) 2006 Daniel Ehrenberg
  ! See http://factorcode.org/license.txt for BSD license.
  USING: http.client xml xml.utilities kernel sequences
-namespaces http math.parser help math.order ;
+namespaces http math.parser help math.order locals ;
  IN: yahoo
  
  TUPLE: result title url summary ;
@@ -16,14 +16,21 @@ C: <result> result
      ] map ;
  
  : yahoo-url ( -- str )
-    "http://search.yahooapis.com/WebSearchService/V1/webSearch?appid=Factor-search&query=" ;
+    "http://search.yahooapis.com/WebSearchService/V1/webSearch" ;
  
-: query ( search num -- url )
+:: query ( search num appid -- url )
      [
          yahoo-url %
-        swap url-encode %
-        "&results=" % #
+        "?appid=" % appid %
+        "&query=" % search url-encode %
+        "&results=" % num #
      ] "" make ;
  
-: search-yahoo ( search num -- seq )
+: factor-id
+    "fRrVAKzV34GDyeRw6bUHDhEWHRedwfOC7e61wwXZLgGF80E67spxdQXuugBe2pgIevMmKwA-" ;
+
+: search-yahoo/id ( search num id -- seq )
      query http-get string>xml parse-yahoo ;
+
+: search-yahoo ( search num -- seq )
+    factor-id search-yahoo/id ;
author	Daniel Ehrenberg <microdan@gmail.com>
	Sat, 24 May 2008 17:17:08 +0000 (12:17 -0500)
committer	Daniel Ehrenberg <microdan@gmail.com>
	Sat, 24 May 2008 17:17:08 +0000 (12:17 -0500)
extra/unicode/collation/collation-tests.factor		patch \| blob \| history
extra/unicode/collation/collation.factor		patch \| blob \| history
extra/yahoo/yahoo.factor	[changed mode: 0644->0755]	patch \| blob \| history