basis/unicode/breaks/breaks.factor

   1 ! Copyright (C) 2008 Daniel Ehrenberg.
   2 ! See http://factorcode.org/license.txt for BSD license.
   3 USING: accessors alien.syntax arrays assocs combinators
   4 combinators.short-circuit compiler.units fry interval-maps io
   5 io.encodings.ascii io.files kernel literals locals make math
   6 math.parser math.ranges memoize namespaces parser sequences
   7 sets simple-flat-file splitting unicode.categories
   8 unicode.categories.syntax unicode.data unicode.normalize
   9 unicode.normalize.private words words.constant ;
  10 FROM: sequences => change-nth ;
  11 IN: unicode.breaks
  12
  13 <PRIVATE
  14
  15 ! Grapheme breaks
  16 <<
  17 CONSTANT: Any 0
  18 CONSTANT: L 1
  19 CONSTANT: V 2
  20 CONSTANT: T 3
  21 CONSTANT: LV 4
  22 CONSTANT: LVT 5
  23 CONSTANT: Extend 6
  24 CONSTANT: Control 7
  25 CONSTANT: CR 8
  26 CONSTANT: LF 9
  27 CONSTANT: SpacingMark 10
  28 CONSTANT: Prepend 11
  29 CONSTANT: graphemes 12
  30
  31 : jamo-class ( ch -- class )
  32     dup initial? [ drop L ]
  33     [ dup medial? [ drop V ] [ final? T Any ? ] if ] if ;
  34
  35 : hangul-class ( ch -- class )
  36     hangul-base - 0x1C mod zero? LV LVT ? ;
  37
  38 CATEGORY: grapheme-control Zl Zp Cc Cf ;
  39 : control-class ( ch -- class )
  40     {
  41         { CHAR: \r [ CR ] }
  42         { CHAR: \n [ LF ] }
  43         { 0x200C [ Extend ] }
  44         { 0x200D [ Extend ] }
  45         [ drop Control ]
  46     } case ;
  47
  48 CATEGORY: extend
  49     Me Mn |
  50     "Other_Grapheme_Extend" property? ;
  51
  52 : loe? ( ch -- ? )
  53     "Logical_Order_Exception" property? ;
  54
  55 CATEGORY: spacing Mc ;
  56
  57 : grapheme-class ( ch -- class )
  58     {
  59         { [ dup jamo? ] [ jamo-class ] }
  60         { [ dup hangul? ] [ hangul-class ] }
  61         { [ dup grapheme-control? ] [ control-class ] }
  62         { [ dup extend? ] [ drop Extend ] }
  63         { [ dup spacing? ] [ drop SpacingMark ] }
  64         { [ loe? ] [ Prepend ] }
  65         [ Any ]
  66     } cond ;
  67
  68 : init-table ( size -- table )
  69     dup [ f <array> ] curry replicate ;
  70
  71 SYMBOL: table
  72
  73 : finish-table ( -- table )
  74     table get [ [ 1 = ] map ] map ;
  75
  76 : eval-seq ( seq -- seq )
  77     [ dup word? [ execute( -- x ) ] when ] map ;
  78
  79 : (set-table) ( class1 class2 val -- )
  80     [ table get nth ] dip '[ _ or ] change-nth ;
  81
  82 : set-table ( classes1 classes2 val -- )
  83     [ [ eval-seq ] bi@ ] dip
  84     [ [ (set-table) ] curry with each ] 2curry each ;
  85
  86 : connect ( class1 class2 -- ) 1 set-table ;
  87 : disconnect ( class1 class2 -- ) 0 set-table ;
  88
  89 : make-grapheme-table ( -- )
  90     { CR } { LF } connect
  91     { Control CR LF } graphemes iota disconnect
  92     graphemes iota { Control CR LF } disconnect
  93     { L } { L V LV LVT } connect
  94     { LV V } { V T } connect
  95     { LVT T } { T } connect
  96     graphemes iota { Extend } connect
  97     graphemes iota { SpacingMark } connect
  98     { Prepend } graphemes iota connect ;
  99
 100 "grapheme-table" create-word-in
 101 graphemes init-table table
 102 [ make-grapheme-table finish-table ] with-variable
 103 define-constant
 104 >>
 105
 106 : grapheme-break? ( class1 class2 -- ? )
 107     grapheme-table nth nth not ;
 108
 109 PRIVATE>
 110
 111 : first-grapheme ( str -- i )
 112     unclip-slice grapheme-class over
 113     [ grapheme-class [ nip ] [ grapheme-break? ] 2bi ] find drop
 114     nip swap length or 1 + ;
 115
 116 : first-grapheme-from ( start str -- i )
 117     over tail-slice first-grapheme + ;
 118
 119 : last-grapheme ( str -- i )
 120     unclip-last-slice grapheme-class swap
 121     [ grapheme-class dup rot grapheme-break? ] find-last drop ?1+ nip ;
 122
 123 : last-grapheme-from ( end str -- i )
 124     swap head-slice last-grapheme ;
 125
 126 <PRIVATE
 127
 128 : >pieces ( str quot: ( str -- i ) -- graphemes )
 129     [ dup empty? not ] swap '[ dup @ cut-slice swap ] produce nip ; inline
 130
 131 PRIVATE>
 132
 133 : >graphemes ( str -- graphemes )
 134     [ first-grapheme ] >pieces ;
 135
 136 : string-reverse ( str -- rts )
 137     >graphemes reverse! concat ;
 138
 139 <PRIVATE
 140
 141 ! Word breaks
 142 <<
 143 CONSTANT: wOther 0
 144 CONSTANT: wCR 1
 145 CONSTANT: wLF 2
 146 CONSTANT: wNewline 3
 147 CONSTANT: wExtend 4
 148 CONSTANT: wFormat 5
 149 CONSTANT: wKatakana 6
 150 CONSTANT: wALetter 7
 151 CONSTANT: wMidLetter 8
 152 CONSTANT: wMidNum 9
 153 CONSTANT: wMidNumLet 10
 154 CONSTANT: wNumeric 11
 155 CONSTANT: wExtendNumLet 12
 156 CONSTANT: unicode-words 13
 157
 158 ! Is there a way to avoid this?
 159 CONSTANT: word-break-classes H{
 160     { "Other" 0 } { "CR" 1 } { "LF" 2 } { "Newline" 3 }
 161     { "Extend" 4 } { "Format" 5 } { "Katakana" 6 }
 162     { "ALetter" 7 } { "MidLetter" 8 }
 163     { "MidNum" 9 } { "MidNumLet" 10 } { "Numeric" 11 }
 164     { "ExtendNumLet" 12 }
 165 }
 166
 167 "word-break-table" create-word-in
 168 "vocab:unicode/data/WordBreakProperty.txt"
 169 load-interval-file dup array>>
 170 [ 2 swap [ word-break-classes at ] change-nth ] each
 171 define-constant
 172 >>
 173
 174 : word-break-prop ( char -- word-break-prop )
 175     word-break-table interval-at wOther or ;
 176
 177 <<
 178 SYMBOL: check-letter-before
 179 SYMBOL: check-letter-after
 180 SYMBOL: check-number-before
 181 SYMBOL: check-number-after
 182
 183 : make-word-table ( -- )
 184     { wCR } { wLF } connect
 185     { wNewline wCR wLF } unicode-words iota disconnect
 186     unicode-words iota { wNewline wCR wLF } disconnect
 187     { wALetter } { wMidLetter wMidNumLet } check-letter-after set-table
 188     { wMidLetter wMidNumLet } { wALetter } check-letter-before set-table
 189     { wNumeric wALetter } { wNumeric wALetter } connect
 190     { wNumeric } { wMidNum wMidNumLet } check-number-after set-table
 191     { wMidNum wMidNumLet } { wNumeric } check-number-before set-table
 192     { wKatakana } { wKatakana } connect
 193     { wALetter wNumeric wKatakana wExtendNumLet } { wExtendNumLet }
 194     [ connect ] [ swap connect ] 2bi ;
 195
 196 : finish-word-table ( -- table )
 197     table get [
 198         [ { { 0 [ f ] } { 1 [ t ] } [ ] } case ] map
 199     ] map ;
 200
 201 "word-table" create-word-in
 202 unicode-words init-table table
 203 [ make-word-table finish-word-table ] with-variable
 204 define-constant
 205 >>
 206
 207 : word-table-nth ( class1 class2 -- ? )
 208     word-table nth nth ;
 209
 210 :: property-not= ( str i property -- ? )
 211     i [
 212         i str ?nth [ word-break-prop property = not ]
 213         [ f ] if*
 214     ] [ t ] if ;
 215
 216 : (format/extended?) ( class -- ? )
 217     ${ wExtend wFormat } member? ; inline
 218
 219 : format/extended? ( ch -- ? )
 220     word-break-prop (format/extended?) ;
 221
 222 : (walk-up) ( str i -- j )
 223     swap [ format/extended? not ] find-from drop ;
 224
 225 : walk-up ( str i -- j )
 226     dupd 1 + (walk-up) [ 1 + (walk-up) ] [ drop f ] if* ;
 227
 228 : (walk-down) ( str i -- j )
 229     swap [ format/extended? not ] find-last-from drop ;
 230
 231 : walk-down ( str i -- j )
 232     dupd (walk-down) [ 1 - (walk-down) ] [ drop f ] if* ;
 233
 234 : word-break? ( str i table-entry -- ? )
 235     {
 236         { t [ 2drop f ] }
 237         { f [ 2drop t ] }
 238         { check-letter-after
 239             [ dupd walk-up wALetter property-not= ] }
 240         { check-letter-before
 241             [ dupd walk-down wALetter property-not= ] }
 242         { check-number-after
 243             [ dupd walk-up wNumeric property-not= ] }
 244         { check-number-before
 245             [ dupd walk-down wNumeric property-not= ] }
 246     } case ;
 247
 248 :: word-break-next ( old-class new-char i str -- next-class ? )
 249     new-char word-break-prop :> new-class
 250     new-class (format/extended?)
 251     [ old-class dup ${ wCR wLF wNewline } member? ] [
 252         new-class old-class over word-table-nth
 253         [ str i 1 - ] dip word-break?
 254     ] if ;
 255
 256 PRIVATE>
 257
 258  : first-word ( str -- i )
 259     [ [ length ] [ first word-break-prop ] bi ] keep
 260     1 swap dup '[ _ word-break-next ] find-index-from
 261     drop nip swap or ;
 262
 263 : >words ( str -- words )
 264     [ first-word ] >pieces ;
 265
 266 <PRIVATE
 267
 268 : nth-next ( i str -- str[i-1] str[i] )
 269     [ [ 1 - ] keep ] dip '[ _ nth ] bi@ ;
 270
 271 PRIVATE>
 272
 273 : word-break-at? ( i str -- ? )
 274     {
 275         [ drop zero? ]
 276         [ length = ]
 277         [
 278             [ nth-next [ word-break-prop ] dip ] 2keep
 279             word-break-next nip
 280         ]
 281     } 2|| ;
 282
 283 : first-word-from ( start str -- i )
 284     over tail-slice first-word + ;
 285
 286 : last-word ( str -- i )
 287     [ length iota ] keep '[ _ word-break-at? ] find-last drop 0 or ;
 288
 289 : last-word-from ( end str -- i )
 290     swap head-slice last-word ;