]> gitweb.factorcode.org Git - factor.git/blobdiff - basis/unicode/breaks/breaks.factor
factor: trim some using lists
[factor.git] / basis / unicode / breaks / breaks.factor
index 1b1d9434f83e7db961cdcf9c3815d91165c91cd4..f69b0945feab23dad5982c8297f61c2e28ea3a53 100644 (file)
 ! Copyright (C) 2008 Daniel Ehrenberg.
 ! See http://factorcode.org/license.txt for BSD license.
-USING: combinators.short-circuit unicode.categories kernel math
-combinators splitting sequences math.parser io.files io assocs
-arrays namespaces make math.ranges unicode.normalize
-unicode.normalize.private values io.encodings.ascii
-unicode.data compiler.units fry unicode.categories.syntax
-alien.syntax sets accessors interval-maps memoize locals words
-simple-flat-file ;
+USING: accessors arrays combinators interval-maps kernel
+literals math namespaces sequences simple-flat-file
+unicode.categories unicode.data unicode.normalize.private words ;
 IN: unicode.breaks
 
 <PRIVATE
-! Grapheme breaks
 
-C-ENUM: Any L V T LV LVT Extend Control CR LF
-    SpacingMark Prepend graphemes ;
+<<
+
+:: load-interval-file-for ( filename n key -- table )
+    filename load-data-file [ n swap nth key = ] filter
+    intern-values expand-ranges ;
+
+>>
+
+CONSTANT: emoji-modifier-table $[
+    "resource:basis/unicode/UCD/emoji/emoji-data.txt"
+    1 "Emoji_Modifier" load-interval-file-for
+]
+
+CONSTANT: extended-pictographic-table $[
+    "resource:basis/unicode/UCD/emoji/emoji-data.txt"
+    1 "Extended_Pictographic" load-interval-file-for
+]
+
+CONSTANT: spacing-mark-exceptions-table $[
+    {
+        0x102B 0x102C 0x1038 { 0x1062 0x1064 } { 0x1067 0x106D }
+        0x1083 { 0x1087 0x108C } 0x108F { 0x109A 0x109C } 0x1A61
+        0x1A63 0x1A64 0xAA7B 0xAA7D 0x11720 0x11721
+    } <interval-set>
+]
+
+! Grapheme breaks
+<<
+CONSTANT: Any 0
+CONSTANT: L 1
+CONSTANT: V 2
+CONSTANT: T 3
+CONSTANT: LV 4
+CONSTANT: LVT 5
+CONSTANT: Extend 6
+CONSTANT: Control 7
+CONSTANT: CR 8
+CONSTANT: LF 9
+CONSTANT: SpacingMark 10
+CONSTANT: Prepend 11
+CONSTANT: ZWJ 12
+CONSTANT: Extended_Pictographic 13
+CONSTANT: (Extended_Pictographic-Extend*-)ZWJ 14
+CONSTANT: Regional_Indicator(even) 15
+CONSTANT: Regional_Indicator(odd) 16
+CONSTANT: graphemes 17
 
 : jamo-class ( ch -- class )
     dup initial? [ drop L ]
     [ dup medial? [ drop V ] [ final? T Any ? ] if ] if ;
 
 : hangul-class ( ch -- class )
-    hangul-base - HEX: 1C mod zero? LV LVT ? ;
+    hangul-base - 0x1C mod zero? LV LVT ? ;
+
+CATEGORY: extend
+    Me Mn |
+    "Other_Grapheme_Extend" property? ;
 
 CATEGORY: grapheme-control Zl Zp Cc Cf ;
+
 : control-class ( ch -- class )
     {
-        { CHAR: \r [ CR ] }
-        { CHAR: \n [ LF ] }
-        { HEX: 200C [ Extend ] }
-        { HEX: 200D [ Extend ] }
+        { [ dup CHAR: \r = ]  [ drop CR ] }
+        { [ dup CHAR: \n = ] [ drop LF ] }
+        { [ dup 0x200C = ] [ drop Extend ] }
+        { [ dup 0x200D = ] [ drop ZWJ ] }
+        { [ dup "Other_Grapheme_Extend" property? ] [ drop Extend ] }
         [ drop Control ]
-    } case ;
-
-CATEGORY: extend
-    Me Mn |
-    "Other_Grapheme_Extend" property? ;
+    } cond ;
 
 : loe? ( ch -- ? )
     "Logical_Order_Exception" property? ;
 
 CATEGORY: spacing Mc ;
 
-: grapheme-class ( ch -- class )
+: regional? ( ch -- ? )
+    "Regional_Indicator" property? ;
+>>
+
+: modifier? ( ch -- ? )
+    emoji-modifier-table interval-key? ; inline
+
+:: grapheme-class ( str -- class )
+    str last
     {
         { [ dup jamo? ] [ jamo-class ] }
         { [ dup hangul? ] [ hangul-class ] }
-        { [ dup grapheme-control? ] [ control-class ] }
+        { [ dup grapheme-control? ] [
+              control-class dup ZWJ = [
+                  drop
+                  str unclip-last-slice drop dup [
+                      [ extend? ]
+                      [ control-class Extend = ]
+                      [ modifier? ]
+                      tri or or not
+                  ] find-last drop [ swap ?nth ] [ last ] if*
+                  extended-pictographic-table interval-key? [
+                      (Extended_Pictographic-Extend*-)ZWJ
+                  ] [ ZWJ ] if
+              ] when
+          ] }
         { [ dup extend? ] [ drop Extend ] }
-        { [ dup spacing? ] [ drop SpacingMark ] }
-        { [ loe? ] [ Prepend ] }
-        [ Any ]
+        { [ dup modifier? ] [ drop Extend ] }
+        { [ dup spacing? ] [
+               spacing-mark-exceptions-table
+               interval-key? [ Any ] [ SpacingMark ] if ] }
+        { [ dup loe? ] [ drop Prepend ] }
+        { [ dup regional? ] [
+              drop
+              f :> ri-even?!
+              str unclip-last-slice drop [
+                  regional? [ ri-even? not ri-even?! f ] [ t ] if
+              ] find-last 2drop
+              ri-even? [
+                  Regional_Indicator(even)
+              ] [
+                  Regional_Indicator(odd)
+              ] if
+          ] }
+        { [ dup extended-pictographic-table interval-key? ] [
+              drop Extended_Pictographic
+          ] }
+        [ drop Any ]
     } cond ;
 
+<<
 : init-table ( size -- table )
     dup [ f <array> ] curry replicate ;
 
@@ -60,7 +141,8 @@ SYMBOL: table
 : finish-table ( -- table )
     table get [ [ 1 = ] map ] map ;
 
-: eval-seq ( seq -- seq ) [ ?execute ] map ;
+: eval-seq ( seq -- seq )
+    [ dup word? [ execute( -- x ) ] when ] map ;
 
 : (set-table) ( class1 class2 val -- )
     [ table get nth ] dip '[ _ or ] change-nth ;
@@ -71,116 +153,157 @@ SYMBOL: table
 
 : connect ( class1 class2 -- ) 1 set-table ;
 : disconnect ( class1 class2 -- ) 0 set-table ;
-  
-: break-around ( classes1 classes2 -- )
-    [ disconnect ] [ swap disconnect ] 2bi ;
 
 : make-grapheme-table ( -- )
-    { CR } { LF } connect
-    { Control CR LF } graphemes disconnect
-    graphemes { Control CR LF } disconnect
-    { L } { L V LV LVT } connect
-    { LV V } { V T } connect
-    { LVT T } { T } connect
-    graphemes { Extend } connect
-    graphemes { SpacingMark } connect
-    { Prepend } graphemes connect ;
-
-VALUE: grapheme-table
+    { CR } { LF } connect                                                       ! GB3
+    { Control CR LF } graphemes <iota> disconnect                               ! GB4
+    graphemes <iota> { Control CR LF } disconnect                               ! GB5
+    { L } { L V LV LVT } connect                                                ! GB6
+    { LV V } { V T } connect                                                    ! GB7
+    { LVT T } { T } connect                                                     ! GB8
+    graphemes <iota> { Extend ZWJ (Extended_Pictographic-Extend*-)ZWJ } connect ! GB9
+    graphemes <iota> { SpacingMark } connect                                    ! GB9a
+    { Prepend } graphemes <iota> connect                                        ! GB9b
+    { (Extended_Pictographic-Extend*-)ZWJ } { Extended_Pictographic } connect   ! GB11
+    { Regional_Indicator(odd) } { Regional_Indicator(even) } connect ;          ! GB12,13
+>>
+
+CONSTANT: grapheme-table $[
+    graphemes init-table table
+    [ make-grapheme-table finish-table ] with-variable
+]
 
 : grapheme-break? ( class1 class2 -- ? )
     grapheme-table nth nth not ;
 
-: chars ( i str n -- str[i] str[i+n] )
-    swap [ dupd + ] dip [ ?nth ] curry bi@ ;
-
-PRIVATE>
-
-: first-grapheme ( str -- i )
-    unclip-slice grapheme-class over
-    [ grapheme-class [ nip ] [ grapheme-break? ] 2bi ] find drop
-    nip swap length or 1+ ;
-
-: first-grapheme-from ( start str -- i )
-    over tail-slice first-grapheme + ;
-
-: last-grapheme ( str -- i )
-    unclip-last-slice grapheme-class swap
-    [ grapheme-class dup rot grapheme-break? ] find-last drop ?1+ nip ;
-
-: last-grapheme-from ( end str -- i )
-    swap head-slice last-grapheme ;
-
-<PRIVATE
-
-: >pieces ( str quot: ( str -- i ) -- graphemes )
-    [ dup empty? not ] swap '[ dup @ cut-slice swap ] produce nip ; inline
-
-PRIVATE>
-
-: >graphemes ( str -- graphemes )
-    [ first-grapheme ] >pieces ;
-
-: string-reverse ( str -- rts )
-    >graphemes reverse concat ;
-
-<PRIVATE
-
-graphemes init-table table
-[ make-grapheme-table finish-table ] with-variable
-to: grapheme-table
-
 ! Word breaks
-
-VALUE: word-break-table
-
-"vocab:unicode/data/WordBreakProperty.txt" load-interval-file
-to: word-break-table
-
-C-ENUM: wOther wCR wLF wNewline wExtend wFormat wKatakana wALetter wMidLetter
-wMidNum wMidNumLet wNumeric wExtendNumLet words ;
-
-: word-break-classes ( -- table ) ! Is there a way to avoid this?
-    H{
-        { "Other" 0 } { "CR" 1 } { "LF" 2 } { "Newline" 3 }
-        { "Extend" 4 } { "Format" 5 } { "Katakana" 6 }
-        { "ALetter" 7 } { "MidLetter" 8 }
-        { "MidNum" 9 } { "MidNumLet" 10 } { "Numeric" 11 }
-        { "ExtendNumLet" 12 }
-    } ;
+<<
+CONSTANT: wOther 0
+CONSTANT: wCR 1
+CONSTANT: wLF 2
+CONSTANT: wNewline 3
+CONSTANT: wExtend 4
+CONSTANT: wZWJ 5
+CONSTANT: wRegional_Indicator 6
+CONSTANT: wFormat 7
+CONSTANT: wKatakana 8
+CONSTANT: wHebrew_Letter 9
+CONSTANT: wALetter 10
+CONSTANT: wSingle_Quote 11
+CONSTANT: wDouble_Quote 12
+CONSTANT: wMidNumLet 13
+CONSTANT: wMidLetter 14
+CONSTANT: wMidNum 15
+CONSTANT: wNumeric 16
+CONSTANT: wExtendNumLet 17
+CONSTANT: wWSegSpace 18
+CONSTANT: unicode-words 19
+>>
+
+<<
+CONSTANT: word-break-table $[
+    "resource:basis/unicode/UCD/auxiliary/WordBreakProperty.txt"
+    load-interval-file dup array>> [
+        2 swap [
+            {
+                { "Other" [ wOther ] }
+                { "CR" [ wCR ] }
+                { "LF" [ wLF ] }
+                { "Newline" [ wNewline ] }
+                { "Extend" [ wExtend ] }
+                { "ZWJ" [ wZWJ ]  }
+                { "Regional_Indicator" [ wRegional_Indicator ] }
+                { "Format" [ wFormat ] }
+                { "Katakana" [ wKatakana ] }
+                { "Hebrew_Letter" [ wHebrew_Letter ] }
+                { "ALetter" [ wALetter ] }
+                { "Single_Quote" [ wSingle_Quote ] }
+                { "Double_Quote" [ wDouble_Quote ] }
+                { "MidNumLet" [ wMidNumLet ] }
+                { "MidLetter" [ wMidLetter ] }
+                { "MidNum" [ wMidNum ] }
+                { "Numeric" [ wNumeric ] }
+                { "ExtendNumLet" [ wExtendNumLet ] }
+                { "WSegSpace" [ wWSegSpace ] }
+            } case
+        ] change-nth
+    ] each
+]
+>>
 
 : word-break-prop ( char -- word-break-prop )
-    word-break-table interval-at
-    word-break-classes at [ wOther ] unless* ;
+    word-break-table interval-at wOther or ;
 
-SYMBOL: check-letter-before
-SYMBOL: check-letter-after
+<<
+SYMBOL: check-AHletter-before
+SYMBOL: check-AHletter-after
+SYMBOL: check-Hebrew-letter-before
+SYMBOL: check-Hebrew-letter-after
 SYMBOL: check-number-before
 SYMBOL: check-number-after
+SYMBOL: check-Extended_Pictographic
+SYMBOL: check-RI-pair
 
 : make-word-table ( -- )
-    { wCR } { wLF } connect
-    { wNewline wCR wLF } words disconnect
-    words { wNewline wCR wLF } disconnect
-    { wALetter } { wMidLetter wMidNumLet } check-letter-after set-table
-    { wMidLetter wMidNumLet } { wALetter } check-letter-before set-table
-    { wNumeric wALetter } { wNumeric wALetter } connect
-    { wNumeric } { wMidNum wMidNumLet } check-number-after set-table
-    { wMidNum wMidNumLet } { wNumeric } check-number-before set-table
-    { wKatakana } { wKatakana } connect
-    { wALetter wNumeric wKatakana wExtendNumLet } { wExtendNumLet }
-    [ connect ] [ swap connect ] 2bi ;
-
-VALUE: word-table
+    { wCR } { wLF } connect                                                   ! WB3
+    { wNewline                                                                ! WB3a
+      wCR
+      wLF } unicode-words <iota> disconnect
+    unicode-words <iota> { wNewline                                           ! WB3b
+                           wCR
+                           wLF } disconnect
+    { wZWJ } unicode-words <iota> check-Extended_Pictographic set-table       ! WB3c
+    { wWSegSpace } { wWSegSpace } connect                                     ! WB3d
+    unicode-words <iota> { wZWJ } connect                                     ! WB4
+    { wALetter                                                                ! WB5
+      wHebrew_Letter } { wALetter
+                         wHebrew_Letter } connect
+    { wALetter                                                                ! WB6
+      wHebrew_Letter } { wMidLetter
+                         wMidNumLet
+                         wSingle_Quote } check-AHletter-after set-table
+    { wMidLetter                                                              ! WB7
+      wMidNumLet
+      wSingle_Quote } { wALetter
+                        wHebrew_Letter } check-AHletter-before set-table
+    { wHebrew_Letter } { wSingle_Quote } connect                              ! WB7a
+    { wHebrew_Letter } { wDouble_Quote } check-Hebrew-letter-after set-table  ! WB7b 
+    { wDouble_Quote } { wHebrew_Letter } check-Hebrew-letter-before set-table ! WB7c 
+    { wNumeric } { wNumeric } connect                                         ! WB8
+    { wALetter
+      wHebrew_Letter } { wNumeric } connect                                   ! WB9
+    { wNumeric } { wALetter                                                   ! WB10
+                   wHebrew_Letter } connect
+    { wMidNum                                                                 ! WB11
+      wMidNumLet
+      wSingle_Quote } { wNumeric } check-number-before set-table
+    { wNumeric } { wMidNum                                                    ! WB12
+                   wMidNumLet
+                   wSingle_Quote } check-number-after set-table
+    { wKatakana } { wKatakana } connect                                       ! WB13
+    { wALetter                                                                ! WB13a 
+      wHebrew_Letter
+      wNumeric
+      wKatakana
+      wExtendNumLet } { wExtendNumLet } connect
+    { wExtendNumLet } { wALetter                                              ! WB13b
+                        wHebrew_Letter
+                        wNumeric
+                        wKatakana } connect
+    { wRegional_Indicator } { wRegional_Indicator } check-RI-pair set-table ; ! WB15,16
 
 : finish-word-table ( -- table )
     table get [
         [ { { 0 [ f ] } { 1 [ t ] } [ ] } case ] map
     ] map ;
+>>
 
-words init-table table
-[ make-word-table finish-word-table ] with-variable
-to: word-table
+<<
+CONSTANT: word-table $[
+    unicode-words init-table table
+    [ make-word-table finish-word-table ] with-variable
+]
+>>
 
 : word-table-nth ( class1 class2 -- ? )
     word-table nth nth ;
@@ -191,65 +314,67 @@ to: word-table
         [ f ] if*
     ] [ t ] if ;
 
+: (format/extended?) ( class -- ? )
+    ${ wExtend wFormat } member? ; inline                                     ! WB4
+
 : format/extended? ( ch -- ? )
-    word-break-prop { 4 5 } member? ;
+    word-break-prop (format/extended?) ;
+
+: (format/extended/zwj?) ( class -- ? )
+    ${ wExtend wFormat wZWJ } member? ; inline                                ! WB4
+
+: format/extended/zwj? ( ch -- ? )
+    word-break-prop (format/extended/zwj?) ;
 
 : (walk-up) ( str i -- j )
-    swap [ format/extended? not ] find-from drop ;
+    swap [ format/extended/zwj? not ] find-from drop ;
 
 : walk-up ( str i -- j )
-    dupd 1+ (walk-up) [ 1+ (walk-up) ] [ drop f ] if* ;
+    dupd 1 + (walk-up) [ 1 + (walk-up) ] [ drop f ] if* ;
 
 : (walk-down) ( str i -- j )
-    swap [ format/extended? not ] find-last-from drop ;
+    swap [ format/extended/zwj? not ] find-last-from drop ;
 
 : walk-down ( str i -- j )
-    dupd (walk-down) [ 1- (walk-down) ] [ drop f ] if* ;
+    dupd (walk-down) [ 1 - (walk-down) ] [ drop f ] if* ;
 
-: word-break? ( str i table-entry -- ? )
+:: word-break? ( str i table-entry -- ? )
+    str i table-entry
     {
         { t [ 2drop f ] }
         { f [ 2drop t ] }
-        { check-letter-after
-            [ dupd walk-up wALetter property-not= ] }
-        { check-letter-before
-            [ dupd walk-down wALetter property-not= ] }
+        { check-AHletter-after
+          [ dupd walk-up
+            [ wALetter property-not= ] [ wHebrew_Letter property-not= ] 2bi or ] }
+        { check-AHletter-before
+          [ dupd walk-down
+            [ wALetter property-not= ] [ wHebrew_Letter property-not= ] 2bi or ] }
+        { check-Hebrew-letter-after
+          [ dupd walk-up wHebrew_Letter property-not= ] }
+        { check-Hebrew-letter-before
+          [ dupd walk-down wHebrew_Letter property-not= ] }
         { check-number-after
-            [ dupd walk-up wNumeric property-not= ] }
+          [ dupd walk-up wNumeric property-not= ] }
         { check-number-before
-            [ dupd walk-down wNumeric property-not= ] }
+          [ dupd walk-down wNumeric property-not= ] }
+        { check-Extended_Pictographic
+          [ swap ?nth extended-pictographic-table interval-key? ] }
+        { check-RI-pair [
+              2drop 
+              f :> ri-even?!
+              i str [
+                  regional? [ ri-even? not ri-even?! f ] [ t ] if
+              ] find-last-from 2drop
+              ri-even? not
+          ] }
     } case ;
 
 :: word-break-next ( old-class new-char i str -- next-class ? )
-    new-char format/extended?
-    [ old-class dup { 1 2 3 } member? ] [
-        new-char word-break-prop old-class over word-table-nth
-        [ str i ] dip word-break?
+    new-char word-break-prop :> new-class
+    new-class (format/extended?)
+    [ old-class dup ${ wCR wLF wNewline } member? ] [
+        new-class old-class over word-table-nth
+        [ str i 1 - ] dip word-break?
     ] if ;
 
 PRIVATE>
-
-: first-word ( str -- i )
-    [ unclip-slice word-break-prop over <enum> ] keep
-    '[ swap _ word-break-next ] assoc-find 2drop
-    nip swap length or 1+ ;
-
-: >words ( str -- words )
-    [ first-word ] >pieces ;
-
-<PRIVATE
-
-: nth-next ( i str -- str[i-1] str[i] )
-    [ [ 1- ] keep ] dip '[ _ nth ] bi@ ;
-
-PRIVATE>
-
-: word-break-at? ( i str -- ? )
-    {
-        [ drop zero? ]
-        [ length = ]
-        [
-            [ nth-next [ word-break-prop ] dip ] 2keep
-            word-break-next nip
-        ]
-    } 2|| ;