basis/xmode/marker/marker.factor

   1 ! Copyright (C) 2008 Slava Pestov.
   2 ! See http://factorcode.org/license.txt for BSD license.
   3 IN: xmode.marker
   4 USING: kernel namespaces make xmode.rules xmode.tokens
   5 xmode.marker.state xmode.marker.context xmode.utilities
   6 xmode.catalog sequences math assocs combinators strings
   7 regexp splitting ascii regexp.backend unicode.case
   8 ascii combinators.short-circuit accessors ;
   9 ! regexp.backend is for the regexp class
  10
  11 ! Next two words copied from parser-combinators
  12 ! Just like head?, but they optionally ignore case
  13
  14 : string= ( str1 str2 ignore-case -- ? )
  15     [ [ >upper ] bi@ ] when sequence= ;
  16
  17 : string-head? ( str1 str2 ignore-case -- ? )
  18     2over shorter?
  19     [ 3drop f ] [
  20         [
  21             [ nip ]
  22             [ length head-slice ] 2bi
  23         ] dip string=
  24     ] if ;
  25
  26 ! Based on org.gjt.sp.jedit.syntax.TokenMarker
  27
  28 : current-keyword ( -- string )
  29     last-offset get position get line get subseq ;
  30
  31 : keyword-number? ( keyword -- ? )
  32     {
  33         [ current-rule-set highlight-digits?>> ]
  34         [ dup [ digit? ] any? ]
  35         [
  36             dup [ digit? ] all? [
  37                 current-rule-set digit-re>>
  38                 dup [ dupd matches? ] [ drop f ] if
  39             ] unless*
  40         ]
  41     } 0&& nip ;
  42
  43 : mark-number ( keyword -- id )
  44     keyword-number? DIGIT and ;
  45
  46 : mark-keyword ( keyword -- id )
  47     current-rule-set keywords>> at ;
  48
  49 : add-remaining-token ( -- )
  50     current-rule-set default>> prev-token, ;
  51
  52 : mark-token ( -- )
  53     current-keyword
  54     dup mark-number [ ] [ mark-keyword ] ?if
  55     [ prev-token, ] when* ;
  56
  57 : current-char ( -- char )
  58     position get line get nth ;
  59
  60 GENERIC: match-position ( rule -- n )
  61
  62 M: mark-previous-rule match-position drop last-offset get ;
  63
  64 M: rule match-position drop position get ;
  65
  66 : can-match-here? ( matcher rule -- ? )
  67     match-position {
  68         [ over ]
  69         [ over at-line-start?>>     over zero?                implies ]
  70         [ over at-whitespace-end?>> over whitespace-end get = implies ]
  71         [ over at-word-start?>>     over last-offset get =    implies ]
  72     } 0&& 2nip ;
  73
  74 : rest-of-line ( -- str )
  75     line get position get tail-slice ;
  76
  77 GENERIC: text-matches? ( string text -- match-count/f )
  78
  79 M: f text-matches?
  80     2drop f ;
  81
  82 M: string-matcher text-matches?
  83     [
  84         [ string>> ] [ ignore-case?>> ] bi string-head?
  85     ] keep string>> length and ;
  86
  87 M: regexp text-matches?
  88     [ >string ] dip match-head ;
  89
  90 : rule-start-matches? ( rule -- match-count/f )
  91     dup start>> tuck swap can-match-here? [
  92         rest-of-line swap text>> text-matches?
  93     ] [
  94         drop f
  95     ] if ;
  96
  97 : rule-end-matches? ( rule -- match-count/f )
  98     dup mark-following-rule? [
  99         dup start>> swap can-match-here? 0 and
 100     ] [
 101         dup end>> tuck swap can-match-here? [
 102             rest-of-line
 103             swap text>> context get end>> or
 104             text-matches?
 105         ] [
 106             drop f
 107         ] if
 108     ] if ;
 109
 110 DEFER: get-rules
 111
 112 : get-always-rules ( vector/f ruleset -- vector/f )
 113     f swap rules>> at ?push-all ;
 114
 115 : get-char-rules ( vector/f char ruleset -- vector/f )
 116     [ ch>upper ] dip rules>> at ?push-all ;
 117
 118 : get-rules ( char ruleset -- seq )
 119     [ f ] 2dip [ get-char-rules ] keep get-always-rules ;
 120
 121 GENERIC: handle-rule-start ( match-count rule -- )
 122
 123 GENERIC: handle-rule-end ( match-count rule -- )
 124
 125 : find-escape-rule ( -- rule )
 126     context get dup
 127     in-rule-set>> escape-rule>> [ ] [
 128         parent>> in-rule-set>>
 129         dup [ escape-rule>> ] when
 130     ] ?if ;
 131
 132 : check-escape-rule ( rule -- ? )
 133     no-escape?>> [ f ] [
 134         find-escape-rule dup [
 135             dup rule-start-matches? dup [
 136                 swap handle-rule-start
 137                 delegate-end-escaped? [ not ] change
 138                 t
 139             ] [
 140                 2drop f
 141             ] if
 142         ] when
 143     ] if ;
 144
 145 : check-every-rule ( -- ? )
 146     current-char current-rule-set get-rules
 147     [ rule-start-matches? ] map-find
 148     dup [ handle-rule-start t ] [ 2drop f ] if ;
 149
 150 : ?end-rule ( -- )
 151     current-rule [
 152         dup rule-end-matches?
 153         dup [ swap handle-rule-end ] [ 2drop ] if
 154     ] when* ;
 155
 156 : rule-match-token* ( rule -- id )
 157     dup match-token>> {
 158         { f [ dup body-token>> ] }
 159         { t [ current-rule-set default>> ] }
 160         [ ]
 161     } case nip ;
 162
 163 M: escape-rule handle-rule-start
 164     drop
 165     ?end-rule
 166     process-escape? get [
 167         escaped? [ not ] change
 168         position [ + ] change
 169     ] [ drop ] if ;
 170
 171 M: seq-rule handle-rule-start
 172     ?end-rule
 173     mark-token
 174     add-remaining-token
 175     tuck body-token>> next-token,
 176     delegate>> [ push-context ] when* ;
 177
 178 UNION: abstract-span-rule span-rule eol-span-rule ;
 179
 180 M: abstract-span-rule handle-rule-start
 181     ?end-rule
 182     mark-token
 183     add-remaining-token
 184     tuck rule-match-token* next-token,
 185     ! ... end subst ...
 186     dup context get (>>in-rule)
 187     delegate>> push-context ;
 188
 189 M: span-rule handle-rule-end
 190     2drop ;
 191
 192 M: mark-following-rule handle-rule-start
 193     ?end-rule
 194     mark-token add-remaining-token
 195     tuck rule-match-token* next-token,
 196     f context get (>>end)
 197     context get (>>in-rule) ;
 198
 199 M: mark-following-rule handle-rule-end
 200     nip rule-match-token* prev-token,
 201     f context get (>>in-rule) ;
 202
 203 M: mark-previous-rule handle-rule-start
 204     ?end-rule
 205     mark-token
 206     dup body-token>> prev-token,
 207     rule-match-token* next-token, ;
 208
 209 : do-escaped ( -- )
 210     escaped? get [
 211         escaped? off
 212         ! ...
 213     ] when ;
 214
 215 : check-end-delegate ( -- ? )
 216     context get parent>> [
 217         in-rule>> [
 218             dup rule-end-matches? dup [
 219                 [
 220                     swap handle-rule-end
 221                     ?end-rule
 222                     mark-token
 223                     add-remaining-token
 224                 ] keep context get parent>> in-rule>>
 225                 rule-match-token* next-token,
 226                 pop-context
 227                 seen-whitespace-end? on t
 228             ] [ drop check-escape-rule ] if
 229         ] [ f ] if*
 230     ] [ f ] if* ;
 231
 232 : handle-no-word-break ( -- )
 233     context get parent>> [
 234         in-rule>> [
 235             dup no-word-break?>> [
 236                 rule-match-token* prev-token,
 237                 pop-context
 238             ] [ drop ] if
 239         ] when*
 240     ] when* ;
 241
 242 : check-rule ( -- )
 243     ?end-rule
 244     handle-no-word-break
 245     mark-token
 246     add-remaining-token ;
 247
 248 : (check-word-break) ( -- )
 249     check-rule
 250
 251     1 current-rule-set default>> next-token, ;
 252
 253 : rule-set-empty? ( ruleset -- ? )
 254     [ rules>> ] [ keywords>> ] bi
 255     [ assoc-empty? ] bi@ and ;
 256
 257 : check-word-break ( -- ? )
 258     current-char dup blank? [
 259         drop
 260
 261         seen-whitespace-end? get [
 262             position get 1+ whitespace-end set
 263         ] unless
 264
 265         (check-word-break)
 266
 267     ] [
 268         ! Micro-optimization with incorrect semantics; we keep
 269         ! it here because jEdit mode files depend on it now...
 270         current-rule-set rule-set-empty? [
 271             drop
 272         ] [
 273             dup alpha? [
 274                 drop
 275             ] [
 276                 current-rule-set rule-set-no-word-sep* member? [
 277                     (check-word-break)
 278                 ] unless
 279             ] if
 280         ] if
 281
 282         seen-whitespace-end? on
 283     ] if
 284     escaped? off
 285     delegate-end-escaped? off t ;
 286
 287
 288 : mark-token-loop ( -- )
 289     position get line get length < [
 290         {
 291             [ check-end-delegate ]
 292             [ check-every-rule ]
 293             [ check-word-break ]
 294         } 0|| drop
 295
 296         position inc
 297         mark-token-loop
 298     ] when ;
 299
 300 : mark-remaining ( -- )
 301     line get length position set
 302     check-rule ;
 303
 304 : unwind-no-line-break ( -- )
 305     context get parent>> [
 306         in-rule>> [
 307             no-line-break?>> [
 308                 pop-context
 309                 unwind-no-line-break
 310             ] when
 311         ] when*
 312     ] when* ;
 313
 314 : tokenize-line ( line-context line rules -- line-context' seq )
 315     [
 316         "MAIN" swap at -rot
 317         init-token-marker
 318         mark-token-loop
 319         mark-remaining
 320         unwind-no-line-break
 321         context get
 322     ] { } make ;