basis/xmode/marker/marker.factor

   1 ! Copyright (C) 2008 Slava Pestov.
   2 ! See https://factorcode.org/license.txt for BSD license.
   3
   4 USING: accessors ascii assocs combinators
   5 combinators.short-circuit formatting kernel make math namespaces
   6 regexp regexp.parser sequences splitting strings
   7 xmode.marker.state xmode.rules xmode.tokens xmode.utilities ;
   8
   9 IN: xmode.marker
  10
  11 ! Next two words copied from parser-combinators
  12 ! Just like head?, but they optionally ignore case
  13
  14 : string= ( str1 str2 ignore-case -- ? )
  15     [ [ >upper ] bi@ ] when sequence= ;
  16
  17 : string-head? ( str1 str2 ignore-case -- ? )
  18     2over shorter?
  19     [ 3drop f ] [
  20         [
  21             [ nip ]
  22             [ length head-slice ] 2bi
  23         ] dip string=
  24     ] if ;
  25
  26 ! Based on org.gjt.sp.jedit.syntax.TokenMarker
  27
  28 : current-keyword ( -- string )
  29     last-offset get position get line get subseq ;
  30
  31 : keyword-number? ( keyword -- ? )
  32     {
  33         [ current-rule-set highlight-digits?>> ]
  34         [ dup [ digit? ] any? ]
  35         [
  36             dup [ digit? ] all? [
  37                 current-rule-set digit-re>>
  38                 [ dupd matches? ] [ f ] if*
  39             ] unless*
  40         ]
  41     } 0&& nip ;
  42
  43 : mark-number ( keyword -- id )
  44     keyword-number? DIGIT and ;
  45
  46 : mark-keyword ( keyword -- id )
  47     current-rule-set keywords>> at ;
  48
  49 : add-remaining-token ( -- )
  50     current-rule-set default>> prev-token, ;
  51
  52 : mark-token ( -- )
  53     current-keyword
  54     dup mark-number [ ] [ mark-keyword ] ?if
  55     [ prev-token, ] when* ;
  56
  57 : current-char ( -- char )
  58     position get line get nth ;
  59
  60 GENERIC: match-position ( rule -- n )
  61
  62 M: mark-previous-rule match-position drop last-offset get ;
  63
  64 M: rule match-position drop position get ;
  65
  66 : can-match-here? ( matcher rule -- ? )
  67     match-position {
  68         [ over ]
  69         [ over at-line-start?>>     over zero?                implies ]
  70         [ over at-whitespace-end?>> over whitespace-end get = implies ]
  71         [ over at-word-start?>>     over last-offset get =    implies ]
  72     } 0&& 2nip ;
  73
  74 : rest-of-line ( -- str )
  75     line get position get tail-slice ;
  76
  77 : match-start ( string regexp -- slice/f )
  78     first-match dup [ dup from>> 0 = [ drop f ] unless ] when ;
  79
  80 GENERIC: text-matches? ( string text -- match-count/f )
  81
  82 M: f text-matches?
  83     2drop f ;
  84
  85 M: string-matcher text-matches?
  86     [ string>> ] [ ignore-case?>> ] bi
  87     [ string-head? ] keepd length and ;
  88
  89 M: regexp text-matches?
  90     [ >string ] dip match-start dup [ to>> ] when ;
  91
  92 <PRIVATE
  93
  94 ! XXX: Terrible inefficient regexp match group support
  95
  96 ! XXX: support named-capturing groups?
  97
  98 : group-start ( i raw -- n/f )
  99     [ CHAR: ( -rot index-from ] keep 2dup
 100     { [ drop ] [ [ 1 + ] dip ?nth CHAR: ? = ] } 2&&
 101     [ [ 1 + ] dip group-start ] [ drop ] if ;
 102
 103 : nth-group-start ( n raw -- n )
 104     [ -1 ] 2dip '[ dup [ 1 + _ group-start ] when ] times ;
 105
 106 : matching-paren ( str -- to )
 107     0 swap [
 108         {
 109             { CHAR: ( [ 1 + ] }
 110             { CHAR: ) [ 1 - ] }
 111             [ drop ]
 112         } case dup zero?
 113     ] find drop nip ;
 114
 115 : nth-group ( n raw -- before nth )
 116     [ nth-group-start ] keep swap cut dup matching-paren 1 + head ;
 117
 118 : match-group-regexp ( regexp n -- skip-regexp match-regexp )
 119     [ [ options>> options>string ] [ raw>> ] bi ] dip swap
 120     nth-group rot '[ _ H{ } [ <optioned-regexp> ] 2cache ] bi@ ;
 121
 122 : skip-first-match ( match regexp -- tailseq )
 123     [ >string ] dip first-match [ seq>> ] [ to>> ] bi tail ;
 124
 125 : nth-match ( match regexp n -- slice/f )
 126     match-group-regexp [ skip-first-match ] [ match-start ] bi* ;
 127
 128 : update-match-groups ( str match regexp -- str' )
 129     pick CHAR: $ swap index [
 130         R/ [$]\d/ [ second CHAR: 0 - nth-match ] 2with re-replace-with
 131     ] [ 2drop ] if ;
 132
 133 GENERIC: fixup-end ( match regexp end -- end' )
 134
 135 M: string-matcher fixup-end
 136     [ string>> -rot update-match-groups ]
 137     [ ignore-case?>> ] bi <string-matcher> ;
 138
 139 MEMO: <fixup-regexp> ( raw matched options -- regexp )
 140     <optioned-regexp> {
 141         [ parse-tree>> ] [ options>> ] [ dfa>> ] [ next-match>> ]
 142     } cleave regexp boa ;
 143
 144 M: regexp fixup-end
 145     [ raw>> [ -rot update-match-groups ] keep swap ]
 146     [ options>> options>string ] bi <fixup-regexp> ;
 147
 148 : fixup-end? ( text -- ? )
 149     { [ regexp? ] [ 0 swap raw>> group-start ] } 1&& ;
 150
 151 : fixup-end/text-matches? ( string regexp rule -- match-count/f )
 152     [ >string ] 2dip [ [ match-start dup ] keep ] dip pick [
 153         end>> [ [ fixup-end ] change-text drop ] [ 2drop ] if*
 154     ] [
 155         3drop
 156     ] if dup [ to>> ] when ;
 157
 158 PRIVATE>
 159
 160 :: rule-start-matches? ( rule -- match-count/f )
 161     rule start>> dup rule can-match-here? [
 162         rest-of-line swap text>>
 163         dup fixup-end? [
 164             rule fixup-end/text-matches?
 165         ] [
 166             text-matches?
 167         ] if
 168     ] [
 169         drop f
 170     ] if ;
 171
 172 : rule-end-matches? ( rule -- match-count/f )
 173     dup mark-following-rule? [
 174         [ start>> ] keep can-match-here? 0 and
 175     ] [
 176         [ end>> dup ] keep can-match-here? [
 177             rest-of-line
 178             swap text>> context get end>> or
 179             text-matches?
 180         ] [
 181             drop f
 182         ] if
 183     ] if ;
 184
 185 DEFER: get-rules
 186
 187 : get-always-rules ( ruleset -- vector/f )
 188     f swap rules>> at ;
 189
 190 : get-char-rules ( char ruleset -- vector/f )
 191     [ ch>upper ] dip rules>> at ;
 192
 193 : get-rules ( char ruleset -- seq )
 194     [ get-char-rules ] [ get-always-rules ] bi [ append ] when* ;
 195
 196 GENERIC: handle-rule-start ( match-count rule -- )
 197
 198 GENERIC: handle-rule-end ( match-count rule -- )
 199
 200 : find-escape-rule ( -- rule )
 201     context get dup
 202     in-rule-set>> escape-rule>> [ ] [
 203         parent>> in-rule-set>>
 204         dup [ escape-rule>> ] when
 205     ] ?if ;
 206
 207 : check-escape-rule ( rule -- ? )
 208     escape-rule>> [ find-escape-rule ] unless*
 209     dup [
 210         dup rule-start-matches? [
 211             swap handle-rule-start
 212             delegate-end-escaped? toggle
 213             t
 214         ] [
 215             drop f
 216         ] if*
 217     ] when ;
 218
 219 : check-every-rule ( -- ? )
 220     current-char current-rule-set get-rules
 221     [ rule-start-matches? ] map-find
 222     [ handle-rule-start t ] [ drop f ] if* ;
 223
 224 : ?end-rule ( -- )
 225     current-rule [
 226         dup rule-end-matches?
 227         [ swap handle-rule-end ] [ drop ] if*
 228     ] when* ;
 229
 230 : rule-match-token* ( rule -- id )
 231     dup match-token>> {
 232         { f [ dup body-token>> ] }
 233         { t [ current-rule-set default>> ] }
 234         [ ]
 235     } case nip ;
 236
 237 M: escape-rule handle-rule-start
 238     drop
 239     ?end-rule
 240     process-escape? get [
 241         escaped? toggle
 242         position [ + ] change
 243     ] [ drop ] if ;
 244
 245 M: seq-rule handle-rule-start
 246     ?end-rule
 247     mark-token
 248     add-remaining-token
 249     [ body-token>> next-token, ] keep
 250     delegate>> [ push-context ] when* ;
 251
 252 UNION: abstract-span-rule span-rule eol-span-rule ;
 253
 254 M: abstract-span-rule handle-rule-start
 255     ?end-rule
 256     mark-token
 257     add-remaining-token
 258     [ rule-match-token* next-token, ] keep
 259     ! ... end subst ...
 260     dup context get in-rule<<
 261     delegate>> push-context ;
 262
 263 M: span-rule handle-rule-end
 264     2drop ;
 265
 266 M: mark-following-rule handle-rule-start
 267     ?end-rule
 268     mark-token add-remaining-token
 269     [ rule-match-token* next-token, ] keep
 270     f context get end<<
 271     context get in-rule<< ;
 272
 273 M: mark-following-rule handle-rule-end
 274     nip rule-match-token* prev-token,
 275     f context get in-rule<< ;
 276
 277 M: mark-previous-rule handle-rule-start
 278     ?end-rule
 279     mark-token
 280     dup body-token>> prev-token,
 281     rule-match-token* next-token, ;
 282
 283 : do-escaped ( -- )
 284     escaped? get [
 285         escaped? off
 286         ! ...
 287     ] when ;
 288
 289 : check-end-delegate ( -- ? )
 290     context get parent>> [
 291         in-rule>> [
 292             dup rule-end-matches? [
 293                 [
 294                     swap handle-rule-end
 295                     ?end-rule
 296                     mark-token
 297                     add-remaining-token
 298                 ] keep context get parent>> in-rule>>
 299                 rule-match-token* next-token,
 300                 pop-context
 301                 seen-whitespace-end? on t
 302             ] [ check-escape-rule ] if*
 303         ] [ f ] if*
 304     ] [ f ] if* ;
 305
 306 : handle-no-word-break ( -- )
 307     context get parent>> [
 308         in-rule>> [
 309             dup no-word-break?>> [
 310                 rule-match-token* prev-token,
 311                 pop-context
 312             ] [ drop ] if
 313         ] when*
 314     ] when* ;
 315
 316 : check-rule ( -- )
 317     ?end-rule
 318     handle-no-word-break
 319     mark-token
 320     add-remaining-token ;
 321
 322 : (check-word-break) ( -- )
 323     check-rule
 324
 325     1 current-rule-set default>> next-token, ;
 326
 327 : rule-set-empty? ( ruleset -- ? )
 328     [ rules>> ] [ keywords>> ] bi
 329     [ assoc-empty? ] both? ;
 330
 331 : check-word-break ( -- ? )
 332     current-char dup blank? [
 333         drop
 334
 335         seen-whitespace-end? get [
 336             position get 1 + whitespace-end set
 337         ] unless
 338
 339         (check-word-break)
 340
 341     ] [
 342         ! Micro-optimization with incorrect semantics; we keep
 343         ! it here because jEdit mode files depend on it now...
 344         current-rule-set rule-set-empty? [
 345             drop
 346         ] [
 347             dup alpha? [
 348                 drop
 349             ] [
 350                 current-rule-set rule-set-no-word-sep* member? [
 351                     (check-word-break)
 352                 ] unless
 353             ] if
 354         ] if
 355
 356         seen-whitespace-end? on
 357     ] if
 358     escaped? off
 359     delegate-end-escaped? off t ;
 360
 361
 362 : mark-token-loop ( -- )
 363     position get line get length < [
 364         {
 365             [ check-end-delegate ]
 366             [ check-every-rule ]
 367             [ check-word-break ]
 368         } 0|| drop
 369
 370         position inc
 371         mark-token-loop
 372     ] when ;
 373
 374 : mark-remaining ( -- )
 375     line get length position set
 376     check-rule ;
 377
 378 : unwind-no-line-break ( -- )
 379     context get parent>> [
 380         in-rule>> [
 381             no-line-break?>> [
 382                 pop-context
 383                 unwind-no-line-break
 384             ] when
 385         ] when*
 386     ] when* ;
 387
 388 : tokenize-line ( line-context line rules -- line-context' seq )
 389     [
 390         "MAIN" of -rot
 391         init-token-marker
 392         mark-token-loop
 393         mark-remaining
 394         unwind-no-line-break
 395         context get
 396     ] { } make ;