]> gitweb.factorcode.org Git - factor.git/blobdiff - basis/unicode/breaks/breaks.factor
factor: trim some using lists
[factor.git] / basis / unicode / breaks / breaks.factor
index 1aa5ec3000059ea6591c5dbc870aa47cf9e53de3..f69b0945feab23dad5982c8297f61c2e28ea3a53 100644 (file)
@@ -1,13 +1,38 @@
 ! Copyright (C) 2008 Daniel Ehrenberg.
 ! See http://factorcode.org/license.txt for BSD license.
-USING: accessors arrays assocs combinators fry interval-maps
-kernel literals locals math namespaces parser sequences
-simple-flat-file unicode.categories unicode.data
-unicode.normalize.private words words.constant ;
+USING: accessors arrays combinators interval-maps kernel
+literals math namespaces sequences simple-flat-file
+unicode.categories unicode.data unicode.normalize.private words ;
 IN: unicode.breaks
 
 <PRIVATE
 
+<<
+
+:: load-interval-file-for ( filename n key -- table )
+    filename load-data-file [ n swap nth key = ] filter
+    intern-values expand-ranges ;
+
+>>
+
+CONSTANT: emoji-modifier-table $[
+    "resource:basis/unicode/UCD/emoji/emoji-data.txt"
+    1 "Emoji_Modifier" load-interval-file-for
+]
+
+CONSTANT: extended-pictographic-table $[
+    "resource:basis/unicode/UCD/emoji/emoji-data.txt"
+    1 "Extended_Pictographic" load-interval-file-for
+]
+
+CONSTANT: spacing-mark-exceptions-table $[
+    {
+        0x102B 0x102C 0x1038 { 0x1062 0x1064 } { 0x1067 0x106D }
+        0x1083 { 0x1087 0x108C } 0x108F { 0x109A 0x109C } 0x1A61
+        0x1A63 0x1A64 0xAA7B 0xAA7D 0x11720 0x11721
+    } <interval-set>
+]
+
 ! Grapheme breaks
 <<
 CONSTANT: Any 0
@@ -22,7 +47,12 @@ CONSTANT: CR 8
 CONSTANT: LF 9
 CONSTANT: SpacingMark 10
 CONSTANT: Prepend 11
-CONSTANT: graphemes 12
+CONSTANT: ZWJ 12
+CONSTANT: Extended_Pictographic 13
+CONSTANT: (Extended_Pictographic-Extend*-)ZWJ 14
+CONSTANT: Regional_Indicator(even) 15
+CONSTANT: Regional_Indicator(odd) 16
+CONSTANT: graphemes 17
 
 : jamo-class ( ch -- class )
     dup initial? [ drop L ]
@@ -31,36 +61,78 @@ CONSTANT: graphemes 12
 : hangul-class ( ch -- class )
     hangul-base - 0x1C mod zero? LV LVT ? ;
 
+CATEGORY: extend
+    Me Mn |
+    "Other_Grapheme_Extend" property? ;
+
 CATEGORY: grapheme-control Zl Zp Cc Cf ;
+
 : control-class ( ch -- class )
     {
-        { CHAR: \r [ CR ] }
-        { CHAR: \n [ LF ] }
-        { 0x200C [ Extend ] }
-        { 0x200D [ Extend ] }
+        { [ dup CHAR: \r = ]  [ drop CR ] }
+        { [ dup CHAR: \n = ] [ drop LF ] }
+        { [ dup 0x200C = ] [ drop Extend ] }
+        { [ dup 0x200D = ] [ drop ZWJ ] }
+        { [ dup "Other_Grapheme_Extend" property? ] [ drop Extend ] }
         [ drop Control ]
-    } case ;
-
-CATEGORY: extend
-    Me Mn |
-    "Other_Grapheme_Extend" property? ;
+    } cond ;
 
 : loe? ( ch -- ? )
     "Logical_Order_Exception" property? ;
 
 CATEGORY: spacing Mc ;
 
-: grapheme-class ( ch -- class )
+: regional? ( ch -- ? )
+    "Regional_Indicator" property? ;
+>>
+
+: modifier? ( ch -- ? )
+    emoji-modifier-table interval-key? ; inline
+
+:: grapheme-class ( str -- class )
+    str last
     {
         { [ dup jamo? ] [ jamo-class ] }
         { [ dup hangul? ] [ hangul-class ] }
-        { [ dup grapheme-control? ] [ control-class ] }
+        { [ dup grapheme-control? ] [
+              control-class dup ZWJ = [
+                  drop
+                  str unclip-last-slice drop dup [
+                      [ extend? ]
+                      [ control-class Extend = ]
+                      [ modifier? ]
+                      tri or or not
+                  ] find-last drop [ swap ?nth ] [ last ] if*
+                  extended-pictographic-table interval-key? [
+                      (Extended_Pictographic-Extend*-)ZWJ
+                  ] [ ZWJ ] if
+              ] when
+          ] }
         { [ dup extend? ] [ drop Extend ] }
-        { [ dup spacing? ] [ drop SpacingMark ] }
-        { [ loe? ] [ Prepend ] }
-        [ Any ]
+        { [ dup modifier? ] [ drop Extend ] }
+        { [ dup spacing? ] [
+               spacing-mark-exceptions-table
+               interval-key? [ Any ] [ SpacingMark ] if ] }
+        { [ dup loe? ] [ drop Prepend ] }
+        { [ dup regional? ] [
+              drop
+              f :> ri-even?!
+              str unclip-last-slice drop [
+                  regional? [ ri-even? not ri-even?! f ] [ t ] if
+              ] find-last 2drop
+              ri-even? [
+                  Regional_Indicator(even)
+              ] [
+                  Regional_Indicator(odd)
+              ] if
+          ] }
+        { [ dup extended-pictographic-table interval-key? ] [
+              drop Extended_Pictographic
+          ] }
+        [ drop Any ]
     } cond ;
 
+<<
 : init-table ( size -- table )
     dup [ f <array> ] curry replicate ;
 
@@ -83,22 +155,24 @@ SYMBOL: table
 : disconnect ( class1 class2 -- ) 0 set-table ;
 
 : make-grapheme-table ( -- )
-    { CR } { LF } connect
-    { Control CR LF } graphemes iota disconnect
-    graphemes iota { Control CR LF } disconnect
-    { L } { L V LV LVT } connect
-    { LV V } { V T } connect
-    { LVT T } { T } connect
-    graphemes iota { Extend } connect
-    graphemes iota { SpacingMark } connect
-    { Prepend } graphemes iota connect ;
-
-"grapheme-table" create-word-in
-graphemes init-table table
-[ make-grapheme-table finish-table ] with-variable
-define-constant
+    { CR } { LF } connect                                                       ! GB3
+    { Control CR LF } graphemes <iota> disconnect                               ! GB4
+    graphemes <iota> { Control CR LF } disconnect                               ! GB5
+    { L } { L V LV LVT } connect                                                ! GB6
+    { LV V } { V T } connect                                                    ! GB7
+    { LVT T } { T } connect                                                     ! GB8
+    graphemes <iota> { Extend ZWJ (Extended_Pictographic-Extend*-)ZWJ } connect ! GB9
+    graphemes <iota> { SpacingMark } connect                                    ! GB9a
+    { Prepend } graphemes <iota> connect                                        ! GB9b
+    { (Extended_Pictographic-Extend*-)ZWJ } { Extended_Pictographic } connect   ! GB11
+    { Regional_Indicator(odd) } { Regional_Indicator(even) } connect ;          ! GB12,13
 >>
 
+CONSTANT: grapheme-table $[
+    graphemes init-table table
+    [ make-grapheme-table finish-table ] with-variable
+]
+
 : grapheme-break? ( class1 class2 -- ? )
     grapheme-table nth nth not ;
 
@@ -109,63 +183,126 @@ CONSTANT: wCR 1
 CONSTANT: wLF 2
 CONSTANT: wNewline 3
 CONSTANT: wExtend 4
-CONSTANT: wFormat 5
-CONSTANT: wKatakana 6
-CONSTANT: wALetter 7
-CONSTANT: wMidLetter 8
-CONSTANT: wMidNum 9
-CONSTANT: wMidNumLet 10
-CONSTANT: wNumeric 11
-CONSTANT: wExtendNumLet 12
-CONSTANT: unicode-words 13
-
-! Is there a way to avoid this?
-CONSTANT: word-break-classes H{
-    { "Other" 0 } { "CR" 1 } { "LF" 2 } { "Newline" 3 }
-    { "Extend" 4 } { "Format" 5 } { "Katakana" 6 }
-    { "ALetter" 7 } { "MidLetter" 8 }
-    { "MidNum" 9 } { "MidNumLet" 10 } { "Numeric" 11 }
-    { "ExtendNumLet" 12 }
-}
-
-"word-break-table" create-word-in
-"vocab:unicode/data/WordBreakProperty.txt"
-load-interval-file dup array>>
-[ 2 swap [ word-break-classes at ] change-nth ] each
-define-constant
+CONSTANT: wZWJ 5
+CONSTANT: wRegional_Indicator 6
+CONSTANT: wFormat 7
+CONSTANT: wKatakana 8
+CONSTANT: wHebrew_Letter 9
+CONSTANT: wALetter 10
+CONSTANT: wSingle_Quote 11
+CONSTANT: wDouble_Quote 12
+CONSTANT: wMidNumLet 13
+CONSTANT: wMidLetter 14
+CONSTANT: wMidNum 15
+CONSTANT: wNumeric 16
+CONSTANT: wExtendNumLet 17
+CONSTANT: wWSegSpace 18
+CONSTANT: unicode-words 19
+>>
+
+<<
+CONSTANT: word-break-table $[
+    "resource:basis/unicode/UCD/auxiliary/WordBreakProperty.txt"
+    load-interval-file dup array>> [
+        2 swap [
+            {
+                { "Other" [ wOther ] }
+                { "CR" [ wCR ] }
+                { "LF" [ wLF ] }
+                { "Newline" [ wNewline ] }
+                { "Extend" [ wExtend ] }
+                { "ZWJ" [ wZWJ ]  }
+                { "Regional_Indicator" [ wRegional_Indicator ] }
+                { "Format" [ wFormat ] }
+                { "Katakana" [ wKatakana ] }
+                { "Hebrew_Letter" [ wHebrew_Letter ] }
+                { "ALetter" [ wALetter ] }
+                { "Single_Quote" [ wSingle_Quote ] }
+                { "Double_Quote" [ wDouble_Quote ] }
+                { "MidNumLet" [ wMidNumLet ] }
+                { "MidLetter" [ wMidLetter ] }
+                { "MidNum" [ wMidNum ] }
+                { "Numeric" [ wNumeric ] }
+                { "ExtendNumLet" [ wExtendNumLet ] }
+                { "WSegSpace" [ wWSegSpace ] }
+            } case
+        ] change-nth
+    ] each
+]
 >>
 
 : word-break-prop ( char -- word-break-prop )
     word-break-table interval-at wOther or ;
 
 <<
-SYMBOL: check-letter-before
-SYMBOL: check-letter-after
+SYMBOL: check-AHletter-before
+SYMBOL: check-AHletter-after
+SYMBOL: check-Hebrew-letter-before
+SYMBOL: check-Hebrew-letter-after
 SYMBOL: check-number-before
 SYMBOL: check-number-after
+SYMBOL: check-Extended_Pictographic
+SYMBOL: check-RI-pair
 
 : make-word-table ( -- )
-    { wCR } { wLF } connect
-    { wNewline wCR wLF } unicode-words iota disconnect
-    unicode-words iota { wNewline wCR wLF } disconnect
-    { wALetter } { wMidLetter wMidNumLet } check-letter-after set-table
-    { wMidLetter wMidNumLet } { wALetter } check-letter-before set-table
-    { wNumeric wALetter } { wNumeric wALetter } connect
-    { wNumeric } { wMidNum wMidNumLet } check-number-after set-table
-    { wMidNum wMidNumLet } { wNumeric } check-number-before set-table
-    { wKatakana } { wKatakana } connect
-    { wALetter wNumeric wKatakana wExtendNumLet } { wExtendNumLet }
-    [ connect ] [ swap connect ] 2bi ;
+    { wCR } { wLF } connect                                                   ! WB3
+    { wNewline                                                                ! WB3a
+      wCR
+      wLF } unicode-words <iota> disconnect
+    unicode-words <iota> { wNewline                                           ! WB3b
+                           wCR
+                           wLF } disconnect
+    { wZWJ } unicode-words <iota> check-Extended_Pictographic set-table       ! WB3c
+    { wWSegSpace } { wWSegSpace } connect                                     ! WB3d
+    unicode-words <iota> { wZWJ } connect                                     ! WB4
+    { wALetter                                                                ! WB5
+      wHebrew_Letter } { wALetter
+                         wHebrew_Letter } connect
+    { wALetter                                                                ! WB6
+      wHebrew_Letter } { wMidLetter
+                         wMidNumLet
+                         wSingle_Quote } check-AHletter-after set-table
+    { wMidLetter                                                              ! WB7
+      wMidNumLet
+      wSingle_Quote } { wALetter
+                        wHebrew_Letter } check-AHletter-before set-table
+    { wHebrew_Letter } { wSingle_Quote } connect                              ! WB7a
+    { wHebrew_Letter } { wDouble_Quote } check-Hebrew-letter-after set-table  ! WB7b 
+    { wDouble_Quote } { wHebrew_Letter } check-Hebrew-letter-before set-table ! WB7c 
+    { wNumeric } { wNumeric } connect                                         ! WB8
+    { wALetter
+      wHebrew_Letter } { wNumeric } connect                                   ! WB9
+    { wNumeric } { wALetter                                                   ! WB10
+                   wHebrew_Letter } connect
+    { wMidNum                                                                 ! WB11
+      wMidNumLet
+      wSingle_Quote } { wNumeric } check-number-before set-table
+    { wNumeric } { wMidNum                                                    ! WB12
+                   wMidNumLet
+                   wSingle_Quote } check-number-after set-table
+    { wKatakana } { wKatakana } connect                                       ! WB13
+    { wALetter                                                                ! WB13a 
+      wHebrew_Letter
+      wNumeric
+      wKatakana
+      wExtendNumLet } { wExtendNumLet } connect
+    { wExtendNumLet } { wALetter                                              ! WB13b
+                        wHebrew_Letter
+                        wNumeric
+                        wKatakana } connect
+    { wRegional_Indicator } { wRegional_Indicator } check-RI-pair set-table ; ! WB15,16
 
 : finish-word-table ( -- table )
     table get [
         [ { { 0 [ f ] } { 1 [ t ] } [ ] } case ] map
     ] map ;
+>>
 
-"word-table" create-word-in
-unicode-words init-table table
-[ make-word-table finish-word-table ] with-variable
-define-constant
+<<
+CONSTANT: word-table $[
+    unicode-words init-table table
+    [ make-word-table finish-word-table ] with-variable
+]
 >>
 
 : word-table-nth ( class1 class2 -- ? )
@@ -178,35 +315,58 @@ define-constant
     ] [ t ] if ;
 
 : (format/extended?) ( class -- ? )
-    ${ wExtend wFormat } member? ; inline
+    ${ wExtend wFormat } member? ; inline                                     ! WB4
 
 : format/extended? ( ch -- ? )
     word-break-prop (format/extended?) ;
 
+: (format/extended/zwj?) ( class -- ? )
+    ${ wExtend wFormat wZWJ } member? ; inline                                ! WB4
+
+: format/extended/zwj? ( ch -- ? )
+    word-break-prop (format/extended/zwj?) ;
+
 : (walk-up) ( str i -- j )
-    swap [ format/extended? not ] find-from drop ;
+    swap [ format/extended/zwj? not ] find-from drop ;
 
 : walk-up ( str i -- j )
     dupd 1 + (walk-up) [ 1 + (walk-up) ] [ drop f ] if* ;
 
 : (walk-down) ( str i -- j )
-    swap [ format/extended? not ] find-last-from drop ;
+    swap [ format/extended/zwj? not ] find-last-from drop ;
 
 : walk-down ( str i -- j )
     dupd (walk-down) [ 1 - (walk-down) ] [ drop f ] if* ;
 
-: word-break? ( str i table-entry -- ? )
+:: word-break? ( str i table-entry -- ? )
+    str i table-entry
     {
         { t [ 2drop f ] }
         { f [ 2drop t ] }
-        { check-letter-after
-            [ dupd walk-up wALetter property-not= ] }
-        { check-letter-before
-            [ dupd walk-down wALetter property-not= ] }
+        { check-AHletter-after
+          [ dupd walk-up
+            [ wALetter property-not= ] [ wHebrew_Letter property-not= ] 2bi or ] }
+        { check-AHletter-before
+          [ dupd walk-down
+            [ wALetter property-not= ] [ wHebrew_Letter property-not= ] 2bi or ] }
+        { check-Hebrew-letter-after
+          [ dupd walk-up wHebrew_Letter property-not= ] }
+        { check-Hebrew-letter-before
+          [ dupd walk-down wHebrew_Letter property-not= ] }
         { check-number-after
-            [ dupd walk-up wNumeric property-not= ] }
+          [ dupd walk-up wNumeric property-not= ] }
         { check-number-before
-            [ dupd walk-down wNumeric property-not= ] }
+          [ dupd walk-down wNumeric property-not= ] }
+        { check-Extended_Pictographic
+          [ swap ?nth extended-pictographic-table interval-key? ] }
+        { check-RI-pair [
+              2drop 
+              f :> ri-even?!
+              i str [
+                  regional? [ ri-even? not ri-even?! f ] [ t ] if
+              ] find-last-from 2drop
+              ri-even? not
+          ] }
     } case ;
 
 :: word-break-next ( old-class new-char i str -- next-class ? )