-! Copyright (C) 2008 Doug Coleman.
+! Copyright (C) 2008, 2009 Doug Coleman, Daniel Ehrenberg.
! See http://factorcode.org/license.txt for BSD license.
-USING: accessors arrays assocs grouping kernel regexp.backend
-locals math namespaces regexp.parser sequences fry quotations
-math.order math.ranges vectors unicode.categories regexp.utils
-regexp.transition-tables words sets regexp.classes unicode.case.private ;
-! This uses unicode.case.private for ch>upper and ch>lower
+USING: accessors arrays assocs combinators.short-circuit kernel
+math namespaces regexp.ast regexp.classes
+regexp.transition-tables sequences sets unicode vectors ;
+IN: regexp.nfa
+
+! This uses unicode for ch>upper and ch>lower
! but case-insensitive matching should be done by case-folding everything
! before processing starts
-IN: regexp.nfa
-ERROR: feature-is-broken feature ;
-
-SYMBOL: negation-mode
-: negated? ( -- ? ) negation-mode get 0 or odd? ;
-
-SINGLETON: eps
-
-MIXIN: traversal-flag
-SINGLETON: lookahead-on INSTANCE: lookahead-on traversal-flag
-SINGLETON: lookahead-off INSTANCE: lookahead-off traversal-flag
-SINGLETON: lookbehind-on INSTANCE: lookbehind-on traversal-flag
-SINGLETON: lookbehind-off INSTANCE: lookbehind-off traversal-flag
-SINGLETON: capture-group-on INSTANCE: capture-group-on traversal-flag
-SINGLETON: capture-group-off INSTANCE: capture-group-off traversal-flag
-SINGLETON: front-anchor INSTANCE: front-anchor traversal-flag
-SINGLETON: back-anchor INSTANCE: back-anchor traversal-flag
-SINGLETON: word-boundary INSTANCE: word-boundary traversal-flag
-
-: options ( -- obj ) current-regexp get options>> ;
-
-: option? ( obj -- ? ) options key? ;
-
-: option-on ( obj -- ) options conjoin ;
-
-: option-off ( obj -- ) options delete-at ;
-
-: next-state ( regexp -- state )
- [ state>> ] [ [ 1+ ] change-state drop ] bi ;
-
-: set-start-state ( regexp -- )
- dup stack>> [
- drop
- ] [
- [ nfa-table>> ] [ pop first ] bi* >>start-state drop
- ] if-empty ;
-
-GENERIC: nfa-node ( node -- )
-
-:: add-simple-entry ( obj class -- )
- [let* | regexp [ current-regexp get ]
- s0 [ regexp next-state ]
- s1 [ regexp next-state ]
- stack [ regexp stack>> ]
- table [ regexp nfa-table>> ] |
- negated? [
- s0 f obj class make-transition table add-transition
- s0 s1 <default-transition> table add-transition
- ] [
- s0 s1 obj class make-transition table add-transition
- ] if
- s0 s1 2array stack push
- t s1 table final-states>> set-at ] ;
-
-: add-traversal-flag ( flag -- )
- stack peek second
- current-regexp get nfa-traversal-flags>> push-at ;
-
-:: concatenate-nodes ( -- )
- [let* | regexp [ current-regexp get ]
- stack [ regexp stack>> ]
- table [ regexp nfa-table>> ]
- s2 [ stack peek first ]
- s3 [ stack pop second ]
- s0 [ stack peek first ]
- s1 [ stack pop second ] |
- s1 s2 eps <literal-transition> table add-transition
- s1 table final-states>> delete-at
- s0 s3 2array stack push ] ;
-
-:: alternate-nodes ( -- )
- [let* | regexp [ current-regexp get ]
- stack [ regexp stack>> ]
- table [ regexp nfa-table>> ]
- s2 [ stack peek first ]
- s3 [ stack pop second ]
- s0 [ stack peek first ]
- s1 [ stack pop second ]
- s4 [ regexp next-state ]
- s5 [ regexp next-state ] |
- s4 s0 eps <literal-transition> table add-transition
- s4 s2 eps <literal-transition> table add-transition
- s1 s5 eps <literal-transition> table add-transition
- s3 s5 eps <literal-transition> table add-transition
- s1 table final-states>> delete-at
- s3 table final-states>> delete-at
- t s5 table final-states>> set-at
- s4 s5 2array stack push ] ;
-
-M: kleene-star nfa-node ( node -- )
- term>> nfa-node
- [let* | regexp [ current-regexp get ]
- stack [ regexp stack>> ]
- s0 [ stack peek first ]
- s1 [ stack pop second ]
- s2 [ regexp next-state ]
- s3 [ regexp next-state ]
- table [ regexp nfa-table>> ] |
- s1 table final-states>> delete-at
- t s3 table final-states>> set-at
- s1 s0 eps <literal-transition> table add-transition
- s2 s0 eps <literal-transition> table add-transition
- s2 s3 eps <literal-transition> table add-transition
- s1 s3 eps <literal-transition> table add-transition
- s2 s3 2array stack push ] ;
-
-M: concatenation nfa-node ( node -- )
- seq>>
- reversed-regexp option? [ <reversed> ] when
- [ [ nfa-node ] each ]
- [ length 1- [ concatenate-nodes ] times ] bi ;
-
-M: alternation nfa-node ( node -- )
- seq>>
- [ [ nfa-node ] each ]
- [ length 1- [ alternate-nodes ] times ] bi ;
-
-M: constant nfa-node ( node -- )
- case-insensitive option? [
- dup char>> [ ch>lower ] [ ch>upper ] bi
- 2dup = [
- 2drop
- char>> literal-transition add-simple-entry
- ] [
- [ literal-transition add-simple-entry ] bi@
- alternate-nodes drop
- ] if
- ] [
- char>> literal-transition add-simple-entry
- ] if ;
+SYMBOL: option-stack
+
+SYMBOL: state
+
+: next-state ( -- state )
+ state [ get ] [ inc ] bi ;
+
+SYMBOL: nfa-table
+
+: set-each ( keys value hashtable -- )
+ '[ _ swap _ set-at ] each ;
+
+: options>hash ( options -- hashtable )
+ H{ } clone [
+ [ [ on>> t ] dip set-each ]
+ [ [ off>> f ] dip set-each ] 2bi
+ ] keep ;
+
+: using-options ( options quot -- )
+ [ options>hash option-stack [ ?push ] change ] dip
+ call option-stack get pop* ; inline
+
+: option? ( obj -- ? )
+ option-stack get assoc-stack ;
+
+GENERIC: nfa-node ( node -- start-state end-state )
+
+: add-simple-entry ( obj -- start-state end-state )
+ [ next-state next-state 2dup ] dip
+ nfa-table get add-transition ;
+
+: epsilon-transition ( source target -- )
+ epsilon nfa-table get add-transition ;
+
+M:: star nfa-node ( node -- start end )
+ node term>> nfa-node :> ( s0 s1 )
+ next-state :> s2
+ next-state :> s3
+ s1 s0 epsilon-transition
+ s2 s0 epsilon-transition
+ s2 s3 epsilon-transition
+ s1 s3 epsilon-transition
+ s2 s3 ;
+
+DEFER: modify-class
-M: epsilon nfa-node ( node -- )
- drop eps literal-transition add-simple-entry ;
+! Potential off-by-one errors when lookaround nested in lookbehind
-M: word nfa-node ( node -- ) class-transition add-simple-entry ;
+M: tagged-epsilon nfa-node
+ clone [ modify-class ] change-tag add-simple-entry ;
-M: any-char nfa-node ( node -- )
- [ dotall option? ] dip any-char-no-nl ?
- class-transition add-simple-entry ;
+M: concatenation nfa-node
+ [ first>> ] [ second>> ] bi
+ reversed-regexp option? [ swap ] when
+ [ nfa-node ] bi@
+ [ epsilon-transition ] dip ;
-! M: beginning-of-text nfa-node ( node -- ) ;
+:: alternate-nodes ( s0 s1 s2 s3 -- start end )
+ next-state :> s4
+ next-state :> s5
+ s4 s0 epsilon-transition
+ s4 s2 epsilon-transition
+ s1 s5 epsilon-transition
+ s3 s5 epsilon-transition
+ s4 s5 ;
-M: beginning-of-line nfa-node ( node -- ) class-transition add-simple-entry ;
+M: alternation nfa-node
+ [ first>> ] [ second>> ] bi
+ [ nfa-node ] bi@
+ alternate-nodes ;
-M: end-of-line nfa-node ( node -- ) class-transition add-simple-entry ;
+GENERIC: modify-class ( char-class -- char-class' )
-: choose-letter-class ( node -- node' )
- case-insensitive option? Letter-class rot ? ;
+M: object modify-class ;
-M: letter-class nfa-node ( node -- )
- choose-letter-class class-transition add-simple-entry ;
+M: concatenation modify-class
+ [ first>> ] [ second>> ] bi [ modify-class ] bi@
+ concatenation boa ;
-M: LETTER-class nfa-node ( node -- )
- choose-letter-class class-transition add-simple-entry ;
+M: alternation modify-class
+ [ first>> ] [ second>> ] bi [ modify-class ] bi@
+ alternation boa ;
-M: character-class-range nfa-node ( node -- )
+M: lookahead modify-class
+ term>> modify-class lookahead boa ;
+
+M: lookbehind modify-class
+ term>> modify-class lookbehind boa ;
+
+: line-option ( multiline unix-lines default -- option )
+ multiline option? [
+ drop [ unix-lines option? ] 2dip swap ?
+ ] [ 2nip ] if ;
+
+M: $crlf modify-class
+ $unix end-of-input line-option ;
+
+M: ^crlf modify-class
+ ^unix beginning-of-input line-option ;
+
+M: integer modify-class
case-insensitive option? [
- ! This should be implemented for Unicode by case-folding
- ! the input and all strings in the regexp.
- dup [ from>> ] [ to>> ] bi
- 2dup [ Letter? ] bi@ and [
- rot drop
- [ [ ch>lower ] bi@ character-class-range boa ]
- [ [ ch>upper ] bi@ character-class-range boa ] 2bi
- [ class-transition add-simple-entry ] bi@
- alternate-nodes
- ] [
- 2drop
- class-transition add-simple-entry
- ] if
- ] [
- class-transition add-simple-entry
+ dup Letter? [
+ [ ch>lower ] [ ch>upper ] bi 2array <or-class>
+ ] when
+ ] when ;
+
+M: primitive-class modify-class
+ class>> modify-class <primitive-class> ;
+
+M: or-class modify-class
+ seq>> [ modify-class ] map <or-class> ;
+
+M: not-class modify-class
+ class>> modify-class <not-class> ;
+
+MEMO: unix-dot ( -- class )
+ CHAR: \n <not-class> ;
+
+MEMO: nonl-dot ( -- class )
+ { CHAR: \n CHAR: \r } <or-class> <not-class> ;
+
+M: dot modify-class
+ drop dotall option? [ t ] [
+ unix-lines option?
+ unix-dot nonl-dot ?
] if ;
-M: capture-group nfa-node ( node -- )
- "capture-groups" feature-is-broken
- eps literal-transition add-simple-entry
- capture-group-on add-traversal-flag
- term>> nfa-node
- eps literal-transition add-simple-entry
- capture-group-off add-traversal-flag
- 2 [ concatenate-nodes ] times ;
-
-! xyzzy
-M: non-capture-group nfa-node ( node -- )
- term>> nfa-node ;
-
-M: reluctant-kleene-star nfa-node ( node -- )
- term>> <kleene-star> nfa-node ;
-
-M: negation nfa-node ( node -- )
- negation-mode inc
- term>> nfa-node
- negation-mode dec ;
-
-M: lookahead nfa-node ( node -- )
- "lookahead" feature-is-broken
- eps literal-transition add-simple-entry
- lookahead-on add-traversal-flag
- term>> nfa-node
- eps literal-transition add-simple-entry
- lookahead-off add-traversal-flag
- 2 [ concatenate-nodes ] times ;
-
-M: lookbehind nfa-node ( node -- )
- "lookbehind" feature-is-broken
- eps literal-transition add-simple-entry
- lookbehind-on add-traversal-flag
- term>> nfa-node
- eps literal-transition add-simple-entry
- lookbehind-off add-traversal-flag
- 2 [ concatenate-nodes ] times ;
-
-M: option nfa-node ( node -- )
- [ option>> ] [ on?>> ] bi [ option-on ] [ option-off ] if
- eps literal-transition add-simple-entry ;
-
-: construct-nfa ( regexp -- )
+: modify-letter-class ( class -- newclass )
+ case-insensitive option? [ drop Letter-class ] when ;
+M: letter-class modify-class modify-letter-class ;
+M: LETTER-class modify-class modify-letter-class ;
+
+: cased-range? ( range -- ? )
+ [ from>> ] [ to>> ] bi {
+ [ [ letter? ] both? ]
+ [ [ LETTER? ] both? ]
+ } 2|| ;
+
+M: range-class modify-class
+ case-insensitive option? [
+ dup cased-range? [
+ [ from>> ] [ to>> ] bi
+ [ [ ch>lower ] bi@ <range-class> ]
+ [ [ ch>upper ] bi@ <range-class> ] 2bi
+ 2array <or-class>
+ ] when
+ ] when ;
+
+M: object nfa-node
+ modify-class add-simple-entry ;
+
+M: with-options nfa-node
+ dup options>> [ tree>> nfa-node ] using-options ;
+
+: construct-nfa ( ast -- nfa-table )
[
- reset-regexp
- negation-mode off
- [ current-regexp set ]
- [ parse-tree>> nfa-node ]
- [ set-start-state ] tri
+ 0 state namespaces:set
+ <transition-table> nfa-table namespaces:set
+ nfa-node
+ nfa-table get
+ swap 1array fast-set >>final-states
+ swap >>start-state
] with-scope ;