Unicode word breaks

author Daniel Ehrenberg <littledan@Macintosh-103.local>

Wed, 7 Jan 2009 21:08:08 +0000 (15:08 -0600)

committer Daniel Ehrenberg <littledan@Macintosh-103.local>

Wed, 7 Jan 2009 21:08:08 +0000 (15:08 -0600)
author Daniel Ehrenberg <littledan@Macintosh-103.local>
Wed, 7 Jan 2009 21:08:08 +0000 (15:08 -0600)
committer Daniel Ehrenberg <littledan@Macintosh-103.local>
Wed, 7 Jan 2009 21:08:08 +0000 (15:08 -0600)
diff --git a/basis/unicode/breaks/breaks-tests.factor b/basis/unicode/breaks/breaks-tests.factor

index 39baa8f80825b04c737b3cfeb3650e6ae9aa5e8a..b91cb2b26cc3f39b2e9bfb617ccdc0b749deb9c8 100644 (file)
--- a/basis/unicode/breaks/breaks-tests.factor
+++ b/basis/unicode/breaks/breaks-tests.factor
@@ -36,4 +36,4 @@ IN: unicode.breaks.tests
      ] each ;
  
  grapheme-break-test parse-test-file [ >graphemes ] test
-! word-break-test parse-test-file [ >words ] test
+word-break-test parse-test-file [ >words ] test
diff --git a/basis/unicode/breaks/breaks.factor b/basis/unicode/breaks/breaks.factor

index 9d2bad472442ed89e366b094fdcb3cb307d64444..5652cc2906360dac71205b22a3c7b9f7d5c478dd 100644 (file)
--- a/basis/unicode/breaks/breaks.factor
+++ b/basis/unicode/breaks/breaks.factor
@@ -110,10 +110,10 @@ VALUE: grapheme-table
      str [
          dup quot call cut-slice
          swap , quot (>pieces)
-    ] unless-empty ;
+    ] unless-empty ; inline recursive
  
  : >pieces ( str quot -- graphemes )
-    [ (>pieces) ] { } make ;
+    [ (>pieces) ] { } make ; inline
  
  : >graphemes ( str -- graphemes )
      [ first-grapheme ] >pieces ;
@@ -139,14 +139,14 @@ to: word-break-table
  C-ENUM: wOther wCR wLF wNewline wExtend wFormat wKatakana wALetter wMidLetter
  wMidNum wMidNumLet wNumeric wExtendNumLet words ;
  
-MEMO: word-break-classes ( -- table )
+: word-break-classes ( -- table ) ! Is there a way to avoid this?
      H{
-        { "Other" wOther } { "CR" wCR } { "LF" wLF } { "Newline" wNewline }
-        { "Extend" wExtend } { "Format" wFormat } { "Katakana" wKatakana }
-        { "ALetter" wALetter } { "MidLetter" wMidLetter }
-        { "MidNum" wMidNum } { "MidNumLet" wMidNumLet } { "Numeric" wNumeric }
-        { "ExtendNumLet" wExtendNumLet }
-    } [ execute ] assoc-map ;
+        { "Other" 0 } { "CR" 1 } { "LF" 2 } { "Newline" 3 }
+        { "Extend" 4 } { "Format" 5 } { "Katakana" 6 }
+        { "ALetter" 7 } { "MidLetter" 8 }
+        { "MidNum" 9 } { "MidNumLet" 10 } { "Numeric" 11 }
+        { "ExtendNumLet" 12 }
+    } ;
  
  : word-break-prop ( char -- word-break-prop )
      word-break-table interval-at
@@ -185,22 +185,49 @@ words init-table table
  [ make-word-table finish-word-table ] with-variable
  to: word-table
  
-: word-break? ( class1 class2 -- ? )
-    word-table nth nth not ;
-
-: skip? ( char -- ? )
-    word-break-prop { 4 5 } member? ; ! wExtend or wFormat
+: word-table-nth ( class1 class2 -- ? )
+    word-table nth nth ;
+
+: property-not= ( i str property -- ? )
+    pick [
+        [ ?nth ] dip swap
+        [ word-break-prop = not ] [ drop f ] if*
+    ] [ 3drop t ] if ;
+
+: format/extended? ( ch -- ? )
+    word-break-prop { 4 5 } member? ;
+
+:: walk-up ( str i -- j )
+    i 1 + str [ format/extended? not ] find-from drop
+    1+ str [ format/extended? not ] find-from drop ; ! possible bounds error?
+
+:: walk-down ( str i -- j )
+    i str [ format/extended? not ] find-last-from drop
+    1- str [ format/extended? not ] find-last-from drop ; ! possible bounds error?
+
+:: word-break? ( table-entry i str -- ? )
+    table-entry {
+        { t [ f ] }
+        { f [ t ] }
+        { check-letter-after
+            [ str i walk-up str wALetter property-not= ] }
+        { check-letter-before
+            [ str i walk-down str wALetter property-not= ] }
+        { check-number-after
+            [ str i walk-up str wNumeric property-not= ] }
+        { check-number-before
+            [ str i walk-down str wNumeric property-not= ] }
+    } case ;
  
-: word-break-next ( old-class new-char -- next-class ? )
-    word-break-prop dup { 4 5 } member?
-    [ drop f ] [ tuck word-break? ] if ;
+:: word-break-next ( old-class new-char i str -- next-class ? )
+    new-char word-break-prop dup { 4 5 } member?
+    [ drop old-class dup { 1 2 3 } member? ]
+    [ old-class over word-table-nth i str word-break? ] if ;
  
-: first-word ( str -- i )
-    unclip-slice word-break-prop over
-    [ word-break-next ] find-index
+:: first-word ( str -- i )
+    str unclip-slice word-break-prop over <enum>
+    [ swap str word-break-next ] assoc-find 2drop
      nip swap length or 1+ ;
-! This must be changed to ignore format/extended chars and
-! handle symbols in the table specially
  
  : >words ( str -- words )
      [ first-word ] >pieces ;
author	Daniel Ehrenberg <littledan@Macintosh-103.local>
	Wed, 7 Jan 2009 21:08:08 +0000 (15:08 -0600)
committer	Daniel Ehrenberg <littledan@Macintosh-103.local>
	Wed, 7 Jan 2009 21:08:08 +0000 (15:08 -0600)
basis/unicode/breaks/breaks-tests.factor		patch \| blob \| history
basis/unicode/breaks/breaks.factor		patch \| blob \| history