xmode.marker: more correct faster update-match-group

[factor.git] / basis / xmode / marker / marker.factor
diff --git a/basis/xmode/marker/marker.factor b/basis/xmode/marker/marker.factor

old mode 100755 (executable)

new mode 100644 (file)

index d3a4f1e..a93283e
--- a/basis/xmode/marker/marker.factor
+++ b/basis/xmode/marker/marker.factor
@@ -1,9 +1,11 @@
  ! Copyright (C) 2008 Slava Pestov.
  ! See http://factorcode.org/license.txt for BSD license.
-USING: kernel namespaces make xmode.rules xmode.tokens
-xmode.marker.state xmode.marker.context xmode.utilities
-xmode.catalog sequences math assocs combinators strings
-regexp splitting ascii combinators.short-circuit accessors ;
+
+USING: accessors ascii assocs combinators
+combinators.short-circuit formatting kernel make math namespaces
+regexp regexp.parser sequences splitting strings
+xmode.marker.state xmode.rules xmode.tokens xmode.utilities ;
+
  IN: xmode.marker
  
  ! Next two words copied from parser-combinators
@@ -33,7 +35,7 @@ IN: xmode.marker
          [
              dup [ digit? ] all? [
                  current-rule-set digit-re>>
-                dup [ dupd matches? ] [ drop f ] if
+                [ dupd matches? ] [ f ] if*
              ] unless*
          ]
      } 0&& nip ;
@@ -72,31 +74,97 @@ M: rule match-position drop position get ;
  : rest-of-line ( -- str )
      line get position get tail-slice ;
  
+: match-start ( string regexp -- slice/f )
+    first-match dup [ dup from>> 0 = [ drop f ] unless ] when ;
+
  GENERIC: text-matches? ( string text -- match-count/f )
  
  M: f text-matches?
      2drop f ;
  
  M: string-matcher text-matches?
-    [
-        [ string>> ] [ ignore-case?>> ] bi string-head?
-    ] keep string>> length and ;
+    [ string>> ] [ ignore-case?>> ] bi
+    [ string-head? ] keepd length and ;
  
  M: regexp text-matches?
-    [ >string ] dip first-match dup [ to>> ] when ;
+    [ >string ] dip match-start dup [ to>> ] when ;
+
+<PRIVATE
+
+! XXX: Terrible inefficient regexp match group support
+
+: #match-groups ( regexp -- n/f )
+    raw>> [ CHAR: ( = ] count [ f ] when-zero ;
+
+: nth-index ( n obj seq -- i )
+    [ = dup [ drop 1 - dup 0 < ] when ] with find drop nip ;
+
+: match-group-regexp ( regexp n -- skip-regexp match-regexp )
+    [ [ options>> options>string ] [ raw>> ] bi ] dip
+    CHAR: ( pick nth-index cut CHAR: ) over index 1 + head
+    rot '[ _ H{ } [ <optioned-regexp> ] 2cache ] bi@ ;
+
+: skip-first-match ( match regexp -- tailseq )
+    first-match [ seq>> ] [ to>> ] bi tail ;
+
+: nth-match ( match regexp n -- slice/f )
+    match-group-regexp [ skip-first-match ] [ first-match ] bi* ;
+
+:: update-match-group ( str match regexp n -- str' )
+    n H{ } [ CHAR: 1 + CHAR: $ swap "" 2sequence ] cache :> x
+    x str subseq-range :> ( from to )
+    from [
+        to str snip-slice match regexp n nth-match glue
+    ] [ str ] if* ;
+
+: update-match-groups ( str match regexp -- str' )
+    [ >string ] dip
+    dup #match-groups [ update-match-group ] 2with each-integer ;
+
+GENERIC: fixup-end ( match regexp end -- end' )
+
+M: string-matcher fixup-end
+    [ string>> -rot update-match-groups ]
+    [ ignore-case?>> ] bi <string-matcher> ;
+
+MEMO: <fixup-regexp> ( raw matched options -- regexp )
+    <optioned-regexp> {
+        [ parse-tree>> ] [ options>> ] [ dfa>> ] [ next-match>> ]
+    } cleave regexp boa ;
  
-: rule-start-matches? ( rule -- match-count/f )
-    dup start>> tuck swap can-match-here? [
-        rest-of-line swap text>> text-matches?
+M: regexp fixup-end
+    [ raw>> [ -rot update-match-groups ] keep swap ]
+    [ options>> options>string ] bi <fixup-regexp> ;
+
+: fixup-end? ( text -- ? )
+    { [ regexp? ] [ #match-groups ] } 1&& ;
+
+: fixup-end/text-matches? ( string regexp rule -- match-count/f )
+    [ >string ] 2dip [ [ match-start dup ] keep ] dip pick [
+        end>> [ [ fixup-end ] change-text drop ] [ 2drop ] if*
+    ] [
+        3drop
+    ] if dup [ to>> ] when ;
+
+PRIVATE>
+
+:: rule-start-matches? ( rule -- match-count/f )
+    rule start>> dup rule can-match-here? [
+        rest-of-line swap text>>
+        dup fixup-end? [
+            rule fixup-end/text-matches?
+        ] [
+            text-matches?
+        ] if
      ] [
          drop f
      ] if ;
  
  : rule-end-matches? ( rule -- match-count/f )
      dup mark-following-rule? [
-        dup start>> swap can-match-here? 0 and
+        [ start>> ] keep can-match-here? 0 and
      ] [
-        dup end>> tuck swap can-match-here? [
+        [ end>> dup ] keep can-match-here? [
              rest-of-line
              swap text>> context get end>> or
              text-matches?
@@ -107,14 +175,14 @@ M: regexp text-matches?
  
  DEFER: get-rules
  
-: get-always-rules ( vector/f ruleset -- vector/f )
-    f swap rules>> at ?push-all ;
+: get-always-rules ( ruleset -- vector/f )
+    f swap rules>> at ;
  
-: get-char-rules ( vector/f char ruleset -- vector/f )
-    [ ch>upper ] dip rules>> at ?push-all ;
+: get-char-rules ( char ruleset -- vector/f )
+    [ ch>upper ] dip rules>> at ;
  
  : get-rules ( char ruleset -- seq )
-    [ f ] 2dip [ get-char-rules ] keep get-always-rules ;
+    [ get-char-rules ] [ get-always-rules ] bi [ append ] when* ;
  
  GENERIC: handle-rule-start ( match-count rule -- )
  
@@ -128,27 +196,26 @@ GENERIC: handle-rule-end ( match-count rule -- )
      ] ?if ;
  
  : check-escape-rule ( rule -- ? )
-    no-escape?>> [ f ] [
-        find-escape-rule dup [
-            dup rule-start-matches? dup [
-                swap handle-rule-start
-                delegate-end-escaped? [ not ] change
-                t
-            ] [
-                2drop f
-            ] if
-        ] when
-    ] if ;
+    escape-rule>> [ find-escape-rule ] unless*
+    dup [
+        dup rule-start-matches? [
+            swap handle-rule-start
+            delegate-end-escaped? toggle
+            t
+        ] [
+            drop f
+        ] if*
+    ] when ;
  
  : check-every-rule ( -- ? )
      current-char current-rule-set get-rules
      [ rule-start-matches? ] map-find
-    dup [ handle-rule-start t ] [ 2drop f ] if ;
+    [ handle-rule-start t ] [ drop f ] if* ;
  
  : ?end-rule ( -- )
      current-rule [
          dup rule-end-matches?
-        dup [ swap handle-rule-end ] [ 2drop ] if
+        [ swap handle-rule-end ] [ drop ] if*
      ] when* ;
  
  : rule-match-token* ( rule -- id )
@@ -162,7 +229,7 @@ M: escape-rule handle-rule-start
      drop
      ?end-rule
      process-escape? get [
-        escaped? [ not ] change
+        escaped? toggle
          position [ + ] change
      ] [ drop ] if ;
  
@@ -170,7 +237,7 @@ M: seq-rule handle-rule-start
      ?end-rule
      mark-token
      add-remaining-token
-    tuck body-token>> next-token,
+    [ body-token>> next-token, ] keep
      delegate>> [ push-context ] when* ;
  
  UNION: abstract-span-rule span-rule eol-span-rule ;
@@ -179,9 +246,9 @@ M: abstract-span-rule handle-rule-start
      ?end-rule
      mark-token
      add-remaining-token
-    tuck rule-match-token* next-token,
+    [ rule-match-token* next-token, ] keep
      ! ... end subst ...
-    dup context get (>>in-rule)
+    dup context get in-rule<<
      delegate>> push-context ;
  
  M: span-rule handle-rule-end
@@ -190,13 +257,13 @@ M: span-rule handle-rule-end
  M: mark-following-rule handle-rule-start
      ?end-rule
      mark-token add-remaining-token
-    tuck rule-match-token* next-token,
-    f context get (>>end)
-    context get (>>in-rule) ;
+    [ rule-match-token* next-token, ] keep
+    f context get end<<
+    context get in-rule<< ;
  
  M: mark-following-rule handle-rule-end
      nip rule-match-token* prev-token,
-    f context get (>>in-rule) ;
+    f context get in-rule<< ;
  
  M: mark-previous-rule handle-rule-start
      ?end-rule
@@ -213,7 +280,7 @@ M: mark-previous-rule handle-rule-start
  : check-end-delegate ( -- ? )
      context get parent>> [
          in-rule>> [
-            dup rule-end-matches? dup [
+            dup rule-end-matches? [
                  [
                      swap handle-rule-end
                      ?end-rule
@@ -223,7 +290,7 @@ M: mark-previous-rule handle-rule-start
                  rule-match-token* next-token,
                  pop-context
                  seen-whitespace-end? on t
-            ] [ drop check-escape-rule ] if
+            ] [ check-escape-rule ] if*
          ] [ f ] if*
      ] [ f ] if* ;
  
@@ -245,12 +312,12 @@ M: mark-previous-rule handle-rule-start
  
  : (check-word-break) ( -- )
      check-rule
-    
+
      1 current-rule-set default>> next-token, ;
  
  : rule-set-empty? ( ruleset -- ? )
      [ rules>> ] [ keywords>> ] bi
-    [ assoc-empty? ] bi@ and ;
+    [ assoc-empty? ] both? ;
  
  : check-word-break ( -- ? )
      current-char dup blank? [
@@ -311,7 +378,7 @@ M: mark-previous-rule handle-rule-start
  
  : tokenize-line ( line-context line rules -- line-context' seq )
      [
-        "MAIN" swap at -rot
+        "MAIN" of -rot
          init-token-marker
          mark-token-loop
          mark-remaining