basis/xmode/marker/marker.factor

   1 ! Copyright (C) 2008 Slava Pestov.
   2 ! See http://factorcode.org/license.txt for BSD license.
   3
   4 USING: accessors ascii assocs combinators
   5 combinators.short-circuit formatting kernel make math namespaces
   6 regexp regexp.parser sequences splitting strings
   7 xmode.marker.state xmode.rules xmode.tokens xmode.utilities ;
   8
   9 IN: xmode.marker
  10
   11 ! Next two words copied from parser-combinators
   12 ! Like sequence= and head?, but they optionally ignore case
  13
  14 : string= ( str1 str2 ignore-case -- ? )
  15     [ [ >upper ] bi@ ] when sequence= ;
  16
  17 : string-head? ( str1 str2 ignore-case -- ? )
  18     2over shorter?
  19     [ 3drop f ] [
  20         [
  21             [ nip ]
  22             [ length head-slice ] 2bi
  23         ] dip string=
  24     ] if ;
  25
  26 ! Based on org.gjt.sp.jedit.syntax.TokenMarker
  27
  28 : current-keyword ( -- string )
  29     last-offset get position get line get subseq ;
  30
   31 : keyword-number? ( keyword -- ? )
          ! Decides whether keyword should be treated as a number. All
          ! three conditions must hold (0&& stops at the first false one):
          !   1. the current rule set has highlight-digits? set;
          !   2. keyword contains at least one digit;
          !   3. keyword consists only of digits, or the rule set has a
          !      digit-re and keyword matches it.
   32     {
   33         [ current-rule-set highlight-digits?>> ]
   34         [ dup [ digit? ] any? ]
   35         [
   36             dup [ digit? ] all? [
   37                 current-rule-set digit-re>>
   38                 [ dupd matches? ] [ f ] if*
   39             ] unless*
   40         ]
          ! The quotations above leave keyword on the stack; nip discards it.
   41     } 0&& nip ;
  42
  43 : mark-number ( keyword -- id )
  44     keyword-number? DIGIT and ;
  45
  46 : mark-keyword ( keyword -- id )
  47     current-rule-set keywords>> at ;
  48
   49 : add-remaining-token ( -- )
          ! Passes the default token id of the current rule set to
          ! prev-token, (defined outside this file; presumably it emits
          ! the text still pending before the current position -- confirm
          ! against xmode.marker.state).
   50     current-rule-set default>> prev-token, ;
  51
  52 : mark-token ( -- )
  53     current-keyword
  54     dup mark-number [ ] [ mark-keyword ] ?if
  55     [ prev-token, ] when* ;
  56
  57 : current-char ( -- char )
  58     position get line get nth ;
  59
   60 GENERIC: match-position ( rule -- n )
      ! Offset within the current line that can-match-here? checks a
      ! matcher's positional constraints against.
   61
      ! A mark-previous-rule is checked at last-offset.
   62 M: mark-previous-rule match-position drop last-offset get ;
   63
      ! General method for rule: checked at the current position.
   64 M: rule match-position drop position get ;
  65
   66 : can-match-here? ( matcher rule -- ? )
          ! True when matcher is non-f and every positional constraint it
          ! sets holds at n, the rule's match-position:
          !   at-line-start?      requires n to be zero
          !   at-whitespace-end?  requires n = whitespace-end
          !   at-word-start?      requires n = last-offset
          ! [ over ] comes first, so the slot reads after it never run on
          ! an f matcher.
   67     match-position {
   68         [ over ]
   69         [ over at-line-start?>>     over zero?                implies ]
   70         [ over at-whitespace-end?>> over whitespace-end get = implies ]
   71         [ over at-word-start?>>     over last-offset get =    implies ]
   72     } 0&& 2nip ;
  73
  74 : rest-of-line ( -- str )
  75     line get position get tail-slice ;
  76
  77 : match-start ( string regexp -- slice/f )
  78     first-match dup [ dup from>> 0 = [ drop f ] unless ] when ;
  79
   80 GENERIC: text-matches? ( string text -- match-count/f )
      ! Tests whether string begins with text, dispatching on text. The
      ! result is the length of the matched prefix, or f.
   81
   82 M: f text-matches?
          ! No text to match against: never matches.
   83     2drop f ;
  84
  85 M: string-matcher text-matches?
  86     [ string>> ] [ ignore-case?>> ] bi
  87     [ string-head? ] keepd length and ;
  88
  89 M: regexp text-matches?
  90     [ >string ] dip match-start dup [ to>> ] when ;
  91
  92 <PRIVATE
  93
   94 ! XXX: Terribly inefficient regexp match group support
  95
  96 : #match-groups ( regexp -- n/f )
  97     raw>> [ CHAR: ( = ] count [ f ] when-zero ;
  98
   99 : nth-index ( n obj seq -- i )
          ! Index of the n-th (zero-based) element of seq equal to obj.
          ! n rides under the find predicate and is decremented on every
          ! equal element; the predicate succeeds once it drops below 0.
          ! i is f when seq holds fewer than n + 1 such elements.
  100     [ = dup [ drop 1 - dup 0 < ] when ] with find drop nip ;
 101
  102 : match-group-regexp ( regexp n -- skip-regexp match-regexp )
          ! Cuts the raw source of regexp at its n-th (zero-based) "(".
          ! skip-regexp is built from everything before that paren;
          ! match-regexp from the paren up to and including the first ")"
          ! that follows it. Both are built with the original option string.
          ! NOTE(review): parens are located by plain character search, so
          ! escaped or nested parens get no special treatment.
  103     [ [ options>> options>string ] [ raw>> ] bi ] dip
  104     CHAR: ( pick nth-index cut CHAR: ) over index 1 + head
  105     rot '[ _ <optioned-regexp> ] bi@ ;
 106
 107 : skip-first-match ( match regexp -- tailseq )
 108     first-match [ seq>> ] [ to>> ] bi tail ;
 109
 110 : nth-match ( match regexp n -- slice/f )
 111     match-group-regexp [ skip-first-match ] [ first-match ] bi* ;
 112
 113 : update-match-group ( str match regexp n -- str' )
 114     [ nth-match ] [ CHAR: 1 + "$%c" sprintf ] bi swap replace ;
 115
  116 : update-match-groups ( str match regexp -- str' )
          ! Applies update-match-group to str once for every group index
          ! from 0 below #match-groups of regexp; match is converted with
          ! >string first.
          ! NOTE(review): #match-groups gives f for a raw source without
          ! "("; the call path visible in this file is guarded by
          ! fixup-end?.
  117     [ >string ] dip
  118     dup #match-groups [ update-match-group ] 2with each-integer ;
 119
  120 GENERIC: fixup-end ( match regexp end -- end' )
      ! Builds a replacement for the end text of a rule from the text
      ! matched at its start (match) and the start regexp. Dispatches on
      ! end.
  121
  122 M: string-matcher fixup-end
          ! New string-matcher whose string is the old one run through
          ! update-match-groups; ignore-case? is carried over.
  123     [ string>> -rot update-match-groups ]
  124     [ ignore-case?>> ] bi <string-matcher> ;
 125
  126 MEMO: <fixup-regexp> ( raw matched options -- regexp )
          ! Compiles matched with the given option string, then builds a
          ! regexp tuple from that compiled regexp's parse-tree, options,
          ! dfa and next-match, with raw as the leading boa argument
          ! (presumably the raw slot -- confirm against the regexp tuple).
          ! Defined with MEMO:, so results are cached per input triple.
  127     <optioned-regexp> {
  128         [ parse-tree>> ] [ options>> ] [ dfa>> ] [ next-match>> ]
  129     } cleave regexp boa ;
 130
  131 M: regexp fixup-end
          ! Runs update-match-groups over the raw source of the end
          ! regexp, then passes the original raw source, the rewritten
          ! source and the option string to <fixup-regexp>.
  132     [ raw>> [ -rot update-match-groups ] keep swap ]
  133     [ options>> options>string ] bi <fixup-regexp> ;
 134
 135 : fixup-end? ( text -- ? )
 136     { [ regexp? ] [ #match-groups ] } 1&& ;
 137
  138 : fixup-end/text-matches? ( string regexp rule -- match-count/f )
          ! Matches regexp at the start of string (via match-start). On a
          ! match, if rule has an end matcher, that matcher's text slot is
          ! replaced in place with the result of fixup-end. Returns the
          ! to>> offset of the match, or f when nothing matched.
  139     [ >string ] 2dip [ [ match-start dup ] keep ] dip pick [
  140         end>> [ [ fixup-end ] change-text drop ] [ 2drop ] if*
  141     ] [
  142         3drop
  143     ] if dup [ to>> ] when ;
 144
 145 PRIVATE>
 146
 147 :: rule-start-matches? ( rule -- match-count/f )
 148     rule start>> dup rule can-match-here? [
 149         rest-of-line swap text>>
 150         dup fixup-end? [
 151             rule fixup-end/text-matches?
 152         ] [
 153             text-matches?
 154         ] if
 155     ] [
 156         drop f
 157     ] if ;
 158
 159 : rule-end-matches? ( rule -- match-count/f )
 160     dup mark-following-rule? [
 161         [ start>> ] keep can-match-here? 0 and
 162     ] [
 163         [ end>> dup ] keep can-match-here? [
 164             rest-of-line
 165             swap text>> context get end>> or
 166             text-matches?
 167         ] [
 168             drop f
 169         ] if
 170     ] if ;
 171
 172 DEFER: get-rules
 173
 174 : get-always-rules ( vector/f ruleset -- vector/f )
 175     f swap rules>> at ?push-all ;
 176
 177 : get-char-rules ( vector/f char ruleset -- vector/f )
 178     [ ch>upper ] dip rules>> at ?push-all ;
 179
 180 : get-rules ( char ruleset -- seq )
 181     [ f ] 2dip [ get-char-rules ] keep get-always-rules ;
 182
  183 GENERIC: handle-rule-start ( match-count rule -- )
      ! Invoked by check-every-rule and check-escape-rule once
      ! rule-start-matches? has produced match-count; dispatches on rule.
  184
  185 GENERIC: handle-rule-end ( match-count rule -- )
      ! Invoked by ?end-rule and check-end-delegate once rule-end-matches?
      ! has produced match-count; dispatches on rule.
 186
 187 : find-escape-rule ( -- rule )
 188     context get dup
 189     in-rule-set>> escape-rule>> [ ] [
 190         parent>> in-rule-set>>
 191         dup [ escape-rule>> ] when
 192     ] ?if ;
 193
 194 : check-escape-rule ( rule -- ? )
 195     escape-rule>> [ find-escape-rule ] unless*
 196     dup [
 197         dup rule-start-matches? [
 198             swap handle-rule-start
 199             delegate-end-escaped? toggle
 200             t
 201         ] [
 202             drop f
 203         ] if*
 204     ] when ;
 205
 206 : check-every-rule ( -- ? )
 207     current-char current-rule-set get-rules
 208     [ rule-start-matches? ] map-find
 209     [ handle-rule-start t ] [ drop f ] if* ;
 210
 211 : ?end-rule ( -- )
 212     current-rule [
 213         dup rule-end-matches?
 214         [ swap handle-rule-end ] [ drop ] if*
 215     ] when* ;
 216
 217 : rule-match-token* ( rule -- id )
 218     dup match-token>> {
 219         { f [ dup body-token>> ] }
 220         { t [ current-rule-set default>> ] }
 221         [ ]
 222     } case nip ;
 223
  224 M: escape-rule handle-rule-start
          ! The rule itself is not used. After ?end-rule, and only when
          ! process-escape? is set, toggles escaped? and advances position
          ! by match-count; otherwise match-count is discarded.
  225     drop
  226     ?end-rule
  227     process-escape? get [
  228         escaped? toggle
  229         position [ + ] change
  230     ] [ drop ] if ;
 231
  232 M: seq-rule handle-rule-start
          ! Runs ?end-rule, mark-token and add-remaining-token, then hands
          ! match-count and the rule's body-token to next-token,. If the
          ! rule has a delegate, push-context is called with it.
  233     ?end-rule
  234     mark-token
  235     add-remaining-token
  236     [ body-token>> next-token, ] keep
  237     delegate>> [ push-context ] when* ;
 238
  239 UNION: abstract-span-rule span-rule eol-span-rule ;
      ! Lets span-rule and eol-span-rule share the handle-rule-start
      ! method below.
  240
  241 M: abstract-span-rule handle-rule-start
          ! Runs ?end-rule, mark-token and add-remaining-token, hands
          ! match-count and rule-match-token* of the rule to next-token,,
          ! stores the rule as in-rule of the current context and calls
          ! push-context with the rule's delegate. Unlike the seq-rule
          ! method, push-context is called even when delegate>> is f.
  242     ?end-rule
  243     mark-token
  244     add-remaining-token
  245     [ rule-match-token* next-token, ] keep
  246     ! ... end subst ...
  247     dup context get in-rule<<
  248     delegate>> push-context ;
 249
  250 M: span-rule handle-rule-end
          ! No-op: both inputs are discarded.
  251     2drop ;
 252
  253 M: mark-following-rule handle-rule-start
          ! Runs ?end-rule, mark-token and add-remaining-token, hands
          ! match-count and rule-match-token* of the rule to next-token,,
          ! then sets end of the current context to f and stores the rule
          ! as its in-rule.
  254     ?end-rule
  255     mark-token add-remaining-token
  256     [ rule-match-token* next-token, ] keep
  257     f context get end<<
  258     context get in-rule<< ;
 259
  260 M: mark-following-rule handle-rule-end
          ! match-count is dropped. rule-match-token* of the rule goes to
          ! prev-token,, and in-rule of the current context is reset to f.
  261     nip rule-match-token* prev-token,
  262     f context get in-rule<< ;
 263
  264 M: mark-previous-rule handle-rule-start
          ! After ?end-rule and mark-token, passes the rule's body-token
          ! to prev-token,, then match-count and rule-match-token* of the
          ! rule to next-token,.
  265     ?end-rule
  266     mark-token
  267     dup body-token>> prev-token,
  268     rule-match-token* next-token, ;
 269
  270 : do-escaped ( -- )
          ! Turns escaped? off when it is set.
          ! NOTE(review): the "! ..." placeholder below suggests more was
          ! intended here; no caller appears in this part of the file.
  271     escaped? get [
  272         escaped? off
  273         ! ...
  274     ] when ;
 275
  276 : check-end-delegate ( -- ? )
          ! Looks at in-rule of the parent context. Answers f when there
          ! is no parent context or no in-rule. If that rule's end matches
          ! here: handles the end, runs ?end-rule, mark-token and
          ! add-remaining-token, hands the match count and
          ! rule-match-token* of the parent's in-rule to next-token,, pops
          ! the context, sets seen-whitespace-end? and answers t.
          ! If the end does not match, the answer is that of
          ! check-escape-rule on the same rule.
  277     context get parent>> [
  278         in-rule>> [
  279             dup rule-end-matches? [
  280                 [
  281                     swap handle-rule-end
  282                     ?end-rule
  283                     mark-token
  284                     add-remaining-token
          ! keep restores the match count for next-token, below
  285                 ] keep context get parent>> in-rule>>
  286                 rule-match-token* next-token,
  287                 pop-context
  288                 seen-whitespace-end? on t
  289             ] [ check-escape-rule ] if*
  290         ] [ f ] if*
  291     ] [ f ] if* ;
 292
 293 : handle-no-word-break ( -- )
 294     context get parent>> [
 295         in-rule>> [
 296             dup no-word-break?>> [
 297                 rule-match-token* prev-token,
 298                 pop-context
 299             ] [ drop ] if
 300         ] when*
 301     ] when* ;
 302
  303 : check-rule ( -- )
          ! Runs, in this order: ?end-rule, handle-no-word-break,
          ! mark-token, add-remaining-token. Used at word breaks
          ! ((check-word-break)) and at end of line (mark-remaining).
  304     ?end-rule
  305     handle-no-word-break
  306     mark-token
  307     add-remaining-token ;
 308
  309 : (check-word-break) ( -- )
          ! Runs check-rule, then hands 1 and the default token of the
          ! current rule set to next-token, (presumably a one-character
          ! token for the break character itself -- confirm against
          ! next-token,).
  310     check-rule
  311
  312     1 current-rule-set default>> next-token, ;
 313
 314 : rule-set-empty? ( ruleset -- ? )
 315     [ rules>> ] [ keywords>> ] bi
 316     [ assoc-empty? ] both? ;
 317
  318 : check-word-break ( -- ? )
          ! Word-break handling for the current character; always answers t.
          ! Blank character: unless seen-whitespace-end? is already set,
          ! records position + 1 as whitespace-end, then runs
          ! (check-word-break).
          ! Any other character: nothing is emitted when the current rule
          ! set is empty or the character is alpha?; otherwise
          ! (check-word-break) runs unless the character is a member of
          ! rule-set-no-word-sep* of the rule set. seen-whitespace-end? is
          ! then set.
          ! In every case escaped? and delegate-end-escaped? are turned off.
  319     current-char dup blank? [
  320         drop
  321
  322         seen-whitespace-end? get [
  323             position get 1 + whitespace-end set
  324         ] unless
  325
  326         (check-word-break)
  327
  328     ] [
  329         ! Micro-optimization with incorrect semantics; we keep
  330         ! it here because jEdit mode files depend on it now...
  331         current-rule-set rule-set-empty? [
  332             drop
  333         ] [
  334             dup alpha? [
  335                 drop
  336             ] [
  337                 current-rule-set rule-set-no-word-sep* member? [
  338                     (check-word-break)
  339                 ] unless
  340             ] if
  341         ] if
  342
  343         seen-whitespace-end? on
  344     ] if
  345     escaped? off
  346     delegate-end-escaped? off t ;
 347
 348
 349 : mark-token-loop ( -- )
 350     position get line get length < [
 351         {
 352             [ check-end-delegate ]
 353             [ check-every-rule ]
 354             [ check-word-break ]
 355         } 0|| drop
 356
 357         position inc
 358         mark-token-loop
 359     ] when ;
 360
  361 : mark-remaining ( -- )
          ! Moves position to the end of the line and runs check-rule
          ! there.
  362     line get length position set
  363     check-rule ;
 364
 365 : unwind-no-line-break ( -- )
 366     context get parent>> [
 367         in-rule>> [
 368             no-line-break?>> [
 369                 pop-context
 370                 unwind-no-line-break
 371             ] when
 372         ] when*
 373     ] when* ;
 374
  375 : tokenize-line ( line-context line rules -- line-context' seq )
          ! Tokenizes one line. The "MAIN" entry of rules is passed to
          ! init-token-marker together with line-context and line; then
          ! the marker loop runs, the remainder is marked and
          ! no-line-break contexts are unwound. The context current at the
          ! end is left on the stack as line-context'. seq is the array
          ! collected by make (presumably the tokens emitted through
          ! prev-token, / next-token, -- confirm against
          ! xmode.marker.state).
  376     [
  377         "MAIN" of -rot
  378         init-token-marker
  379         mark-token-loop
  380         mark-remaining
  381         unwind-no-line-break
  382         context get
  383     ] { } make ;