: LETTER? ( ch -- ? ) CHAR: A CHAR: Z between? ; inline\r
: digit? ( ch -- ? ) CHAR: 0 CHAR: 9 between? ; inline\r
: printable? ( ch -- ? ) CHAR: \s CHAR: ~ between? ; inline\r
-: control? ( ch -- ? ) "\0\e\r\n\t\u000008\u00007f" member? ; inline\r
+: control? ( ch -- ? ) { [ 0 HEX: 1F between? ] [ HEX: 7F = ] } 1|| ; inline\r
: quotable? ( ch -- ? ) { [ printable? ] [ "\"\\" member? not ] } 1&& ; inline\r
: Letter? ( ch -- ? ) { [ letter? ] [ LETTER? ] } 1|| ; inline\r
: alpha? ( ch -- ? ) { [ Letter? ] [ digit? ] } 1|| ; inline\r
: >upper ( str -- upper ) [ ch>upper ] map ;\r
\r
HINTS: >lower string ;\r
-HINTS: >upper string ;
\ No newline at end of file
+HINTS: >upper string ;\r
--- /dev/null
+! Copyright (C) 2008, 2009 Doug Coleman, Daniel Ehrenberg.
+! See http://factorcode.org/license.txt for BSD license.
+USING: kernel arrays accessors fry sequences regexp.classes ;
+FROM: math.ranges => [a,b] ;
+IN: regexp.ast
+
+TUPLE: negation term ;
+C: <negation> negation
+
+TUPLE: from-to n m ;
+C: <from-to> from-to
+
+TUPLE: at-least n ;
+C: <at-least> at-least
+
+SINGLETON: epsilon
+
+TUPLE: concatenation first second ;
+
+: <concatenation> ( seq -- concatenation )
+ [ epsilon ] [ unclip [ concatenation boa ] reduce ] if-empty ;
+
+TUPLE: alternation first second ;
+
+: <alternation> ( seq -- alternation )
+ unclip [ alternation boa ] reduce ;
+
+TUPLE: star term ;
+C: <star> star
+
+TUPLE: with-options tree options ;
+C: <with-options> with-options
+
+TUPLE: options on off ;
+C: <options> options
+
+SINGLETONS: unix-lines dotall multiline comments case-insensitive
+unicode-case reversed-regexp ;
+
+: <maybe> ( term -- term' )
+ f <concatenation> 2array <alternation> ;
+
+: <plus> ( term -- term' )
+ dup <star> 2array <concatenation> ;
+
+: repetition ( n term -- term' )
+ <array> <concatenation> ;
+
+GENERIC: <times> ( term times -- term' )
+M: at-least <times>
+ n>> swap [ repetition ] [ <star> ] bi 2array <concatenation> ;
+M: from-to <times>
+ [ n>> ] [ m>> ] bi [a,b] swap '[ _ repetition ] map <alternation> ;
+
+: char-class ( ranges ? -- term )
+ [ <or-class> ] dip [ <not-class> ] when ;
+
+TUPLE: lookahead term ;
+C: <lookahead> lookahead
+
+TUPLE: lookbehind term ;
+C: <lookbehind> lookbehind
+++ /dev/null
-! Copyright (C) 2008 Doug Coleman.
-! See http://factorcode.org/license.txt for BSD license.
-USING: accessors hashtables kernel math vectors ;
-IN: regexp.backend
-
-TUPLE: regexp
- raw
- { options hashtable }
- stack
- parse-tree
- nfa-table
- dfa-table
- minimized-table
- matchers
- { nfa-traversal-flags hashtable }
- { dfa-traversal-flags hashtable }
- { state integer }
- { new-states vector }
- { visited-states hashtable } ;
-
-: reset-regexp ( regexp -- regexp )
- 0 >>state
- V{ } clone >>stack
- V{ } clone >>new-states
- H{ } clone >>visited-states ;
-
-SYMBOL: current-regexp
--- /dev/null
+! Copyright (C) 2009 Daniel Ehrenberg.
+! See http://factorcode.org/license.txt for BSD license.
+USING: regexp.classes tools.test arrays kernel ;
+IN: regexp.classes.tests
+
+[ f ] [ { 1 2 } <and-class> ] unit-test
+[ T{ or-class f { 2 1 } } ] [ { 1 2 } <or-class> ] unit-test
+[ 3 ] [ { 1 2 } <and-class> 3 2array <or-class> ] unit-test
+[ CHAR: A ] [ CHAR: A LETTER-class <primitive-class> 2array <and-class> ] unit-test
+[ CHAR: A ] [ LETTER-class <primitive-class> CHAR: A 2array <and-class> ] unit-test
+[ T{ primitive-class { class LETTER-class } } ] [ CHAR: A LETTER-class <primitive-class> 2array <or-class> ] unit-test
+[ T{ primitive-class { class LETTER-class } } ] [ LETTER-class <primitive-class> CHAR: A 2array <or-class> ] unit-test
+[ t ] [ { t 1 } <or-class> ] unit-test
+[ t ] [ { 1 t } <or-class> ] unit-test
+[ f ] [ { f 1 } <and-class> ] unit-test
+[ f ] [ { 1 f } <and-class> ] unit-test
+[ 1 ] [ { f 1 } <or-class> ] unit-test
+[ 1 ] [ { 1 f } <or-class> ] unit-test
+[ 1 ] [ { t 1 } <and-class> ] unit-test
+[ 1 ] [ { 1 t } <and-class> ] unit-test
+[ 1 ] [ 1 <not-class> <not-class> ] unit-test
+[ 1 ] [ { 1 1 } <and-class> ] unit-test
+[ 1 ] [ { 1 1 } <or-class> ] unit-test
+[ T{ primitive-class { class letter-class } } ] [ letter-class <primitive-class> dup 2array <and-class> ] unit-test
+[ T{ primitive-class { class letter-class } } ] [ letter-class <primitive-class> dup 2array <or-class> ] unit-test
-! Copyright (C) 2008 Doug Coleman.
+! Copyright (C) 2008, 2009 Doug Coleman, Daniel Ehrenberg.
! See http://factorcode.org/license.txt for BSD license.
-USING: accessors kernel math math.order words regexp.utils
-unicode.categories combinators.short-circuit ;
+USING: accessors kernel math math.order words combinators locals
+ascii unicode.categories combinators.short-circuit sequences ;
+QUALIFIED-WITH: multi-methods m
IN: regexp.classes
SINGLETONS: any-char any-char-no-nl
SINGLETONS: beginning-of-input beginning-of-line
end-of-input end-of-line ;
-MIXIN: node
-TUPLE: character-class-range from to ; INSTANCE: character-class-range node
+TUPLE: range from to ;
+C: <range> range
GENERIC: class-member? ( obj class -- ? )
-M: t class-member? ( obj class -- ? ) 2drop f ;
+M: t class-member? ( obj class -- ? ) 2drop t ;
-M: integer class-member? ( obj class -- ? ) 2drop f ;
+M: integer class-member? ( obj class -- ? ) = ;
-M: character-class-range class-member? ( obj class -- ? )
+M: range class-member? ( obj class -- ? )
[ from>> ] [ to>> ] bi between? ;
M: any-char class-member? ( obj class -- ? )
M: digit-class class-member? ( obj class -- ? )
drop digit? ;
+: c-identifier-char? ( ch -- ? )
+ { [ alpha? ] [ CHAR: _ = ] } 1|| ;
+
M: c-identifier-class class-member? ( obj class -- ? )
- drop
- { [ digit? ] [ Letter? ] [ CHAR: _ = ] } 1|| ;
+ drop c-identifier-char? ;
M: alpha-class class-member? ( obj class -- ? )
drop alpha? ;
+: punct? ( ch -- ? )
+ "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~" member? ;
+
M: punctuation-class class-member? ( obj class -- ? )
drop punct? ;
+: java-printable? ( ch -- ? )
+ { [ alpha? ] [ punct? ] } 1|| ;
+
M: java-printable-class class-member? ( obj class -- ? )
drop java-printable? ;
drop { [ blank? ] [ CHAR: \n = not ] } 1&& ;
M: control-character-class class-member? ( obj class -- ? )
- drop control-char? ;
+ drop control? ;
+
+: hex-digit? ( ch -- ? )
+ {
+ [ CHAR: A CHAR: F between? ]
+ [ CHAR: a CHAR: f between? ]
+ [ CHAR: 0 CHAR: 9 between? ]
+ } 1|| ;
M: hex-digit-class class-member? ( obj class -- ? )
drop hex-digit? ;
+: java-blank? ( ch -- ? )
+ {
+ CHAR: \s CHAR: \t CHAR: \n
+ HEX: b HEX: 7 CHAR: \r
+ } member? ;
+
M: java-blank-class class-member? ( obj class -- ? )
drop java-blank? ;
2drop f ;
M: terminator-class class-member? ( obj class -- ? )
- drop {
- [ CHAR: \r = ]
- [ CHAR: \n = ]
- [ CHAR: \u000085 = ]
- [ CHAR: \u002028 = ]
- [ CHAR: \u002029 = ]
- } 1|| ;
+ drop "\r\n\u000085\u002029\u002028" member? ;
M: beginning-of-line class-member? ( obj class -- ? )
2drop f ;
M: end-of-line class-member? ( obj class -- ? )
2drop f ;
+
+M: f class-member? 2drop f ;
+
+TUPLE: primitive-class class ;
+C: <primitive-class> primitive-class
+
+TUPLE: or-class seq ;
+
+TUPLE: not-class class ;
+
+TUPLE: and-class seq ;
+
+m:GENERIC: combine-and ( class1 class2 -- combined ? )
+
+: replace-if-= ( object object -- object ? )
+ over = ;
+
+m:METHOD: combine-and { object object } replace-if-= ;
+
+m:METHOD: combine-and { integer integer }
+ 2dup = [ drop t ] [ 2drop f t ] if ;
+
+m:METHOD: combine-and { t object }
+ nip t ;
+
+m:METHOD: combine-and { f object }
+ drop t ;
+
+m:METHOD: combine-and { not-class object }
+ [ class>> ] dip = [ f t ] [ f f ] if ;
+
+m:METHOD: combine-and { integer object }
+ 2dup class-member? [ drop t ] [ 2drop f t ] if ;
+
+m:GENERIC: combine-or ( class1 class2 -- combined ? )
+
+m:METHOD: combine-or { object object } replace-if-= ;
+
+m:METHOD: combine-or { integer integer }
+ 2dup = [ drop t ] [ 2drop f f ] if ;
+
+m:METHOD: combine-or { t object }
+ drop t ;
+
+m:METHOD: combine-or { f object }
+ nip t ;
+
+m:METHOD: combine-or { not-class object }
+ [ class>> ] dip = [ t t ] [ f f ] if ;
+
+m:METHOD: combine-or { integer object }
+ 2dup class-member? [ nip t ] [ 2drop f f ] if ;
+
+: try-combine ( elt1 elt2 quot -- combined/f ? )
+ 3dup call [ [ 3drop ] dip t ] [ drop swapd call ] if ; inline
+
+:: prefix-combining ( seq elt quot: ( elt1 elt2 -- combined/f ? ) -- newseq )
+ f :> combined!
+ seq [ elt quot try-combine swap combined! ] find drop
+ [ seq remove-nth combined prefix ]
+ [ seq elt prefix ] if* ; inline
+
+:: combine ( seq quot: ( elt1 elt2 -- combined/f ? ) empty class -- newseq )
+ seq { } [ quot prefix-combining ] reduce
+ dup length {
+ { 0 [ drop empty ] }
+ { 1 [ first ] }
+ [ drop class new swap >>seq ]
+ } case ; inline
+
+: <and-class> ( seq -- class )
+ [ combine-and ] t and-class combine ;
+
+M: and-class class-member?
+ seq>> [ class-member? ] with all? ;
+
+: <or-class> ( seq -- class )
+ [ combine-or ] f or-class combine ;
+
+M: or-class class-member?
+ seq>> [ class-member? ] with any? ;
+
+: <not-class> ( class -- inverse )
+ {
+ { t [ f ] }
+ { f [ t ] }
+ [ dup not-class? [ class>> ] [ not-class boa ] if ]
+ } case ;
+
+M: not-class class-member?
+ class>> class-member? not ;
+
+M: primitive-class class-member?
+ class>> class-member? ;
+
+UNION: class primitive-class not-class or-class range ;
--- /dev/null
+USING: regexp.dfa tools.test ;
+IN: regexp.dfa.tests
+
+[ [ ] [ ] while-changes ] must-infer
+
-! Copyright (C) 2008 Doug Coleman.
+! Copyright (C) 2008, 2009 Doug Coleman, Daniel Ehrenberg.
! See http://factorcode.org/license.txt for BSD license.
USING: accessors arrays assocs combinators fry kernel locals
math math.order regexp.nfa regexp.transition-tables sequences
-sets sorting vectors regexp.utils sequences.deep ;
-USING: io prettyprint threads ;
+sets sorting vectors ;
IN: regexp.dfa
-: find-delta ( states transition regexp -- new-states )
- nfa-table>> transitions>>
- rot [ swap at at ] with with gather sift ;
+:: (while-changes) ( obj quot: ( obj -- obj' ) comp: ( obj -- key ) old-key -- obj )
+ obj quot call :> new-obj
+ new-obj comp call :> new-key
+ new-key old-key =
+ [ new-obj ]
+ [ new-obj quot comp new-key (while-changes) ]
+ if ; inline recursive
-: (find-epsilon-closure) ( states regexp -- new-states )
+: while-changes ( obj quot pred -- obj' )
+ 3dup nip call (while-changes) ; inline
+
+: find-delta ( states transition nfa -- new-states )
+ transitions>> '[ _ swap _ at at ] gather sift ;
+
+: (find-epsilon-closure) ( states nfa -- new-states )
eps swap find-delta ;
-: find-epsilon-closure ( states regexp -- new-states )
+: find-epsilon-closure ( states nfa -- new-states )
'[ dup _ (find-epsilon-closure) union ] [ length ] while-changes
natural-sort ;
-: find-closure ( states transition regexp -- new-states )
- [ find-delta ] 2keep nip find-epsilon-closure ;
+: find-closure ( states transition nfa -- new-states )
+ [ find-delta ] keep find-epsilon-closure ;
-: find-start-state ( regexp -- state )
- [ nfa-table>> start-state>> 1vector ] keep find-epsilon-closure ;
+: find-start-state ( nfa -- state )
+ [ start-state>> 1vector ] keep find-epsilon-closure ;
-: find-transitions ( seq1 regexp -- seq2 )
- nfa-table>> transitions>>
- [ at keys ] curry gather
+: find-transitions ( dfa-state nfa -- next-dfa-state )
+ transitions>>
+ '[ _ at keys ] gather
eps swap remove ;
-: add-todo-state ( state regexp -- )
- 2dup visited-states>> key? [
- 2drop
- ] [
- [ visited-states>> conjoin ]
- [ new-states>> push ] 2bi
+: add-todo-state ( state visited-states new-states -- )
+ 3dup drop key? [ 3drop ] [
+ [ conjoin ] [ push ] bi-curry* bi
] if ;
-: new-transitions ( regexp -- )
- dup new-states>> [
- drop
- ] [
- dupd pop dup pick find-transitions rot
- [
- [ [ find-closure ] 2keep nip dupd add-todo-state ] 3keep
- [ swapd transition make-transition ] dip
- dfa-table>> add-transition
- ] curry with each
- new-transitions
+:: new-transitions ( nfa dfa new-states visited-states -- nfa dfa )
+ new-states [ nfa dfa ] [
+ pop :> state
+ state nfa find-transitions
+ [| trans |
+ state trans nfa find-closure :> new-state
+ new-state visited-states new-states add-todo-state
+ state new-state trans transition make-transition dfa add-transition
+ ] each
+ nfa dfa new-states visited-states new-transitions
] if-empty ;
: states ( hashtable -- array )
[ keys ]
- [ values [ values concat ] map concat append ] bi ;
+ [ values [ values concat ] map concat ] bi
+ append ;
-: set-final-states ( regexp -- )
- dup
- [ nfa-table>> final-states>> keys ]
- [ dfa-table>> transitions>> states ] bi
- [ intersects? ] with filter
-
- swap dfa-table>> final-states>>
+: set-final-states ( nfa dfa -- )
+ [
+ [ final-states>> keys ]
+ [ transitions>> states ] bi*
+ [ intersects? ] with filter
+ ] [ final-states>> ] bi
[ conjoin ] curry each ;
-: set-initial-state ( regexp -- )
- dup
- [ dfa-table>> ] [ find-start-state ] bi
- [ >>start-state drop ] keep
- 1vector >>new-states drop ;
-
-: set-traversal-flags ( regexp -- )
- dup
- [ nfa-traversal-flags>> ]
- [ dfa-table>> transitions>> keys ] bi
- [ [ nip ] [ [ swap at ] with map concat ] 2bi ] with H{ } map>assoc
- >>dfa-traversal-flags drop ;
+: initialize-dfa ( nfa -- dfa )
+ <transition-table>
+ swap find-start-state >>start-state ;
-: construct-dfa ( regexp -- )
- {
- [ set-initial-state ]
- [ new-transitions ]
- [ set-final-states ]
- [ set-traversal-flags ]
- } cleave ;
+: construct-dfa ( nfa -- dfa )
+ dup initialize-dfa
+ dup start-state>> 1vector
+ H{ } clone
+ new-transitions
+ [ set-final-states ] keep ;
--- /dev/null
+! Copyright (C) 2009 Daniel Ehrenberg.
+! See http://factorcode.org/license.txt for BSD license.
+USING: kernel accessors regexp.classes math.bits assocs sequences
+arrays sets regexp.dfa math fry regexp.minimize ;
+IN: regexp.disambiguate
+
+TUPLE: parts in out ;
+
+: make-partition ( choices classes -- partition )
+ zip [ first ] partition [ values ] bi@ parts boa ;
+
+: powerset-partition ( classes -- partitions )
+ [ length [ 2^ ] keep ] keep '[
+ _ <bits> _ make-partition
+ ] map rest ;
+
+: partition>class ( parts -- class )
+ [ out>> [ <not-class> ] map ]
+ [ in>> <and-class> ] bi
+ prefix <and-class> ;
+
+: get-transitions ( partition state-transitions -- next-states )
+ [ in>> ] dip '[ _ at ] map prune ;
+
+: disambiguate ( dfa -- nfa )
+ [
+ [
+ [ keys powerset-partition ] keep '[
+ [ partition>class ]
+ [ _ get-transitions ] bi
+ ] H{ } map>assoc
+ [ drop ] assoc-filter
+ ] assoc-map
+ ] change-transitions ;
+
+: nfa>dfa ( nfa -- dfa )
+ construct-dfa
+ minimize disambiguate
+ construct-dfa minimize ;
--- /dev/null
+! Copyright (C) 2009 Daniel Ehrenberg
+! See http://factorcode.org/license.txt for BSD license.
+USING: tools.test regexp.minimize assocs regexp accessors regexp.transition-tables ;
+IN: regexp.minimize.tests
+
+[ t ] [ 1 2 H{ { { 1 2 } t } } same-partition? ] unit-test
+[ t ] [ 2 1 H{ { { 1 2 } t } } same-partition? ] unit-test
+[ f ] [ 2 3 H{ { { 1 2 } t } } same-partition? ] unit-test
+
+[ H{ { 1 1 } { 2 1 } { 3 3 } { 4 3 } } ]
+[ { { 1 1 } { 1 2 } { 2 2 } { 3 3 } { 3 4 } { 4 4 } } [ t ] H{ } map>assoc partition>classes ] unit-test
+
+[ { { 1 2 } { 3 4 } } ] [ H{ { "elephant" 1 } { "tiger" 3 } } H{ { "elephant" 2 } { "tiger" 4 } } assemble-values ] unit-test
+
+[ 3 ] [ R/ ab|ac/ dfa>> transitions>> assoc-size ] unit-test
+[ 3 ] [ R/ a(b|c)/ dfa>> transitions>> assoc-size ] unit-test
+[ 1 ] [ R/ ((aa*)*)*/ dfa>> transitions>> assoc-size ] unit-test
+[ 1 ] [ R/ a|((aa*)*)*/ dfa>> transitions>> assoc-size ] unit-test
+[ 2 ] [ R/ ab|((aa*)*)*b/ dfa>> transitions>> assoc-size ] unit-test
+[ 4 ] [ R/ ab|cd/ dfa>> transitions>> assoc-size ] unit-test
+[ 1 ] [ R/ [a-z]*|[A-Z]*/i dfa>> transitions>> assoc-size ] unit-test
+
+[
+ T{ transition-table
+ { transitions H{
+ { 0 H{ { CHAR: a 1 } { CHAR: b 1 } } }
+ { 1 H{ { CHAR: a 2 } { CHAR: b 2 } } }
+ { 2 H{ { CHAR: c 3 } } }
+ { 3 H{ } }
+ } }
+ { start-state 0 }
+ { final-states H{ { 3 3 } } }
+ }
+] [
+ T{ transition-table
+ { transitions H{
+ { 0 H{ { CHAR: a 1 } { CHAR: b 4 } } }
+ { 1 H{ { CHAR: a 2 } { CHAR: b 5 } } }
+ { 2 H{ { CHAR: c 3 } } }
+ { 3 H{ } }
+ { 4 H{ { CHAR: a 2 } { CHAR: b 5 } } }
+ { 5 H{ { CHAR: c 6 } } }
+ { 6 H{ } }
+ } }
+ { start-state 0 }
+ { final-states H{ { 3 3 } { 6 6 } } }
+ } combine-states
+] unit-test
--- /dev/null
+! Copyright (C) 2009 Daniel Ehrenberg
+! See http://factorcode.org/license.txt for BSD license.
+USING: kernel sequences regexp.transition-tables fry assocs
+accessors locals math sorting arrays sets hashtables regexp.dfa
+combinators.short-circuit ;
+IN: regexp.minimize
+
+: number-transitions ( transitions numbering -- new-transitions )
+ dup '[
+ [ _ at ]
+ [ [ first _ at ] assoc-map ] bi*
+ ] assoc-map ;
+
+: table>state-numbers ( table -- assoc )
+ transitions>> keys <enum> [ swap ] H{ } assoc-map-as ;
+
+: map-set ( assoc quot -- new-assoc )
+ '[ drop @ dup ] assoc-map ; inline
+
+: rewrite-transitions ( transition-table assoc quot -- transition-table )
+ [
+ [ '[ _ at ] change-start-state ]
+ [ '[ [ _ at ] map-set ] change-final-states ]
+ [ ] tri
+ ] dip '[ _ @ ] change-transitions ; inline
+
+: number-states ( table -- newtable )
+ dup table>state-numbers
+ [ number-transitions ] rewrite-transitions ;
+
+: initially-same? ( s1 s2 transition-table -- ? )
+ {
+ [ drop <= ]
+ [ transitions>> '[ _ at keys ] bi@ set= ]
+ [ final-states>> '[ _ key? ] bi@ = ]
+ } 3&& ;
+
+:: initialize-partitions ( transition-table -- partitions )
+ ! Partition table is sorted-array => ?
+ H{ } clone :> out
+ transition-table transitions>> keys :> states
+ states [| s1 |
+ states [| s2 |
+ s1 s2 transition-table initially-same?
+ [ s1 s2 2array out conjoin ] when
+ ] each
+ ] each out ;
+
+: same-partition? ( s1 s2 partitions -- ? )
+ [ 2array natural-sort ] dip key? ;
+
+: assemble-values ( assoc1 assoc2 -- values )
+ dup keys '[ _ swap [ at ] curry map ] bi@ zip ;
+
+: stay-same? ( s1 s2 transition partitions -- ? )
+ [ '[ _ transitions>> at ] bi@ assemble-values ] dip
+ '[ _ same-partition? ] assoc-all? ;
+
+: partition-more ( partitions transition-table -- partitions )
+ over '[ drop first2 _ _ stay-same? ] assoc-filter ;
+
+: partition>classes ( partitions -- synonyms ) ! old-state => new-state
+ >alist sort-keys
+ [ drop first2 swap ] assoc-map
+ <reversed>
+ >hashtable ;
+
+: state-classes ( transition-table -- synonyms )
+ [ initialize-partitions ] keep
+ '[ _ partition-more ] [ assoc-size ] while-changes
+ partition>classes ;
+
+: canonical-state? ( state state-classes -- ? )
+ dupd at = ;
+
+: delete-duplicates ( transitions state-classes -- new-transitions )
+ '[ drop _ canonical-state? ] assoc-filter ;
+
+: rewrite-duplicates ( new-transitions state-classes -- new-transitions )
+ '[ [ _ at ] assoc-map ] assoc-map ;
+
+: combine-transitions ( transitions state-classes -- new-transitions )
+ [ delete-duplicates ] [ rewrite-duplicates ] bi ;
+
+: combine-states ( table -- smaller-table )
+ dup state-classes
+ [ combine-transitions ] rewrite-transitions ;
+
+: minimize ( table -- minimal-table )
+ clone number-states combine-states ;
--- /dev/null
+! Copyright (C) 2009 Daniel Ehrenberg.
+! See http://factorcode.org/license.txt for BSD license.
+USING: tools.test regexp.negation regexp.transition-tables regexp.classes ;
+IN: regexp.negation.tests
+
+[
+ ! R/ |[^a]|.+/
+ T{ transition-table
+ { transitions H{
+ { 0 H{ { CHAR: a 1 } { T{ not-class f CHAR: a } -1 } } }
+ { 1 H{ { t -1 } } }
+ { -1 H{ { t -1 } } }
+ } }
+ { start-state 0 }
+ { final-states H{ { 0 0 } { -1 -1 } } }
+ }
+] [
+ ! R/ a/
+ T{ transition-table
+ { transitions H{
+ { 0 H{ { CHAR: a 1 } } }
+ { 1 H{ } }
+ } }
+ { start-state 0 }
+ { final-states H{ { 1 1 } } }
+ } negate-table
+] unit-test
--- /dev/null
+! Copyright (C) 2009 Daniel Ehrenberg.
+! See http://factorcode.org/license.txt for BSD license.
+USING: regexp.nfa regexp.disambiguate kernel sequences
+assocs regexp.classes hashtables accessors fry vectors
+regexp.ast regexp.transition-tables regexp.minimize ;
+IN: regexp.negation
+
+: ast>dfa ( parse-tree -- minimal-dfa )
+ construct-nfa nfa>dfa ;
+
+CONSTANT: fail-state -1
+
+: add-default-transition ( state's-transitions -- new-state's-transitions )
+ clone dup
+ [ [ fail-state ] dip keys [ <not-class> ] map <and-class> ] keep set-at ;
+
+: fail-state-recurses ( transitions -- new-transitions )
+ clone dup
+ [ fail-state t associate fail-state ] dip set-at ;
+
+: add-fail-state ( transitions -- new-transitions )
+ [ add-default-transition ] assoc-map
+ fail-state-recurses ;
+
+: assoc>set ( assoc -- keys-set )
+ [ drop dup ] assoc-map ;
+
+: inverse-final-states ( transition-table -- final-states )
+ [ transitions>> assoc>set ] [ final-states>> ] bi assoc-diff ;
+
+: negate-table ( transition-table -- transition-table )
+ clone
+ [ add-fail-state ] change-transitions
+ dup inverse-final-states >>final-states ;
+
+: renumber-transitions ( transitions numbering -- new-transitions )
+ dup '[
+ [ _ at ]
+ [ [ [ _ at ] map ] assoc-map ] bi*
+ ] assoc-map ;
+
+: renumber-states ( transition-table -- transition-table )
+ dup transitions>> keys [ next-state ] H{ } map>assoc
+ [ renumber-transitions ] rewrite-transitions ;
+
+: box-transitions ( transition-table -- transition-table )
+ [ [ [ 1vector ] assoc-map ] assoc-map ] change-transitions ;
+
+: unify-final-state ( transition-table -- transition-table )
+ dup [ final-states>> keys ] keep
+ '[ -2 eps <literal-transition> _ add-transition ] each
+ H{ { -2 -2 } } >>final-states ;
+
+: adjoin-dfa ( transition-table -- start end )
+ box-transitions unify-final-state renumber-states
+ [ start-state>> ]
+ [ final-states>> keys first ]
+ [ table [ transitions>> ] bi@ swap update ] tri ;
+
+M: negation nfa-node ( node -- start end )
+ term>> ast>dfa negate-table adjoin-dfa ;
! Copyright (C) 2008 Doug Coleman.
! See http://factorcode.org/license.txt for BSD license.
-USING: accessors arrays assocs grouping kernel regexp.backend
-locals math namespaces regexp.parser sequences fry quotations
-math.order math.ranges vectors unicode.categories regexp.utils
-regexp.transition-tables words sets regexp.classes unicode.case.private ;
+USING: accessors arrays assocs grouping kernel
+locals math namespaces sequences fry quotations
+math.order math.ranges vectors unicode.categories
+regexp.transition-tables words sets hashtables combinators.short-circuit
+unicode.case.private regexp.ast regexp.classes ;
+IN: regexp.nfa
+
! This uses unicode.case.private for ch>upper and ch>lower
! but case-insensitive matching should be done by case-folding everything
! before processing starts
-IN: regexp.nfa
-ERROR: feature-is-broken feature ;
+GENERIC: remove-lookahead ( syntax-tree -- syntax-tree' )
+! This is unfinished and does nothing right now!
+
+M: object remove-lookahead ;
-SYMBOL: negation-mode
-: negated? ( -- ? ) negation-mode get 0 or odd? ;
+M: with-options remove-lookahead
+ [ tree>> remove-lookahead ] [ options>> ] bi <with-options> ;
+
+M: alternation remove-lookahead
+ [ first>> ] [ second>> ] bi [ remove-lookahead ] bi@ alternation boa ;
+
+M: concatenation remove-lookahead ;
SINGLETON: eps
-MIXIN: traversal-flag
-SINGLETON: lookahead-on INSTANCE: lookahead-on traversal-flag
-SINGLETON: lookahead-off INSTANCE: lookahead-off traversal-flag
-SINGLETON: lookbehind-on INSTANCE: lookbehind-on traversal-flag
-SINGLETON: lookbehind-off INSTANCE: lookbehind-off traversal-flag
-SINGLETON: capture-group-on INSTANCE: capture-group-on traversal-flag
-SINGLETON: capture-group-off INSTANCE: capture-group-off traversal-flag
-SINGLETON: front-anchor INSTANCE: front-anchor traversal-flag
-SINGLETON: back-anchor INSTANCE: back-anchor traversal-flag
-SINGLETON: word-boundary INSTANCE: word-boundary traversal-flag
-
-: options ( -- obj ) current-regexp get options>> ;
-
-: option? ( obj -- ? ) options key? ;
-
-: option-on ( obj -- ) options conjoin ;
-
-: option-off ( obj -- ) options delete-at ;
-
-: next-state ( regexp -- state )
- [ state>> ] [ [ 1+ ] change-state drop ] bi ;
-
-: set-start-state ( regexp -- )
- dup stack>> [
- drop
- ] [
- [ nfa-table>> ] [ pop first ] bi* >>start-state drop
- ] if-empty ;
-
-GENERIC: nfa-node ( node -- )
-
-:: add-simple-entry ( obj class -- )
- [let* | regexp [ current-regexp get ]
- s0 [ regexp next-state ]
- s1 [ regexp next-state ]
- stack [ regexp stack>> ]
- table [ regexp nfa-table>> ] |
- negated? [
- s0 f obj class make-transition table add-transition
- s0 s1 <default-transition> table add-transition
- ] [
- s0 s1 obj class make-transition table add-transition
- ] if
- s0 s1 2array stack push
- t s1 table final-states>> set-at ] ;
-
-: add-traversal-flag ( flag -- )
- stack peek second
- current-regexp get nfa-traversal-flags>> push-at ;
-
-:: concatenate-nodes ( -- )
- [let* | regexp [ current-regexp get ]
- stack [ regexp stack>> ]
- table [ regexp nfa-table>> ]
- s2 [ stack peek first ]
- s3 [ stack pop second ]
- s0 [ stack peek first ]
- s1 [ stack pop second ] |
- s1 s2 eps <literal-transition> table add-transition
- s1 table final-states>> delete-at
- s0 s3 2array stack push ] ;
-
-:: alternate-nodes ( -- )
- [let* | regexp [ current-regexp get ]
- stack [ regexp stack>> ]
- table [ regexp nfa-table>> ]
- s2 [ stack peek first ]
- s3 [ stack pop second ]
- s0 [ stack peek first ]
- s1 [ stack pop second ]
- s4 [ regexp next-state ]
- s5 [ regexp next-state ] |
- s4 s0 eps <literal-transition> table add-transition
- s4 s2 eps <literal-transition> table add-transition
- s1 s5 eps <literal-transition> table add-transition
- s3 s5 eps <literal-transition> table add-transition
- s1 table final-states>> delete-at
- s3 table final-states>> delete-at
- t s5 table final-states>> set-at
- s4 s5 2array stack push ] ;
-
-M: kleene-star nfa-node ( node -- )
- term>> nfa-node
- [let* | regexp [ current-regexp get ]
- stack [ regexp stack>> ]
- s0 [ stack peek first ]
- s1 [ stack pop second ]
- s2 [ regexp next-state ]
- s3 [ regexp next-state ]
- table [ regexp nfa-table>> ] |
- s1 table final-states>> delete-at
- t s3 table final-states>> set-at
- s1 s0 eps <literal-transition> table add-transition
- s2 s0 eps <literal-transition> table add-transition
- s2 s3 eps <literal-transition> table add-transition
- s1 s3 eps <literal-transition> table add-transition
- s2 s3 2array stack push ] ;
-
-M: concatenation nfa-node ( node -- )
- seq>>
- reversed-regexp option? [ <reversed> ] when
- [ [ nfa-node ] each ]
- [ length 1- [ concatenate-nodes ] times ] bi ;
-
-M: alternation nfa-node ( node -- )
- seq>>
- [ [ nfa-node ] each ]
- [ length 1- [ alternate-nodes ] times ] bi ;
-
-M: constant nfa-node ( node -- )
- case-insensitive option? [
- dup char>> [ ch>lower ] [ ch>upper ] bi
- 2dup = [
- 2drop
- char>> literal-transition add-simple-entry
- ] [
- [ literal-transition add-simple-entry ] bi@
- alternate-nodes drop
- ] if
- ] [
- char>> literal-transition add-simple-entry
- ] if ;
-
-M: epsilon nfa-node ( node -- )
+SYMBOL: option-stack
+
+SYMBOL: state
+
+: next-state ( -- state )
+ state [ get ] [ inc ] bi ;
+
+SYMBOL: nfa-table
+: table ( -- table ) nfa-table get ;
+
+: set-each ( keys value hashtable -- )
+ '[ _ swap _ set-at ] each ;
+
+: options>hash ( options -- hashtable )
+ H{ } clone [
+ [ [ on>> t ] dip set-each ]
+ [ [ off>> f ] dip set-each ] 2bi
+ ] keep ;
+
+: using-options ( options quot -- )
+ [ options>hash option-stack [ ?push ] change ] dip
+ call option-stack get pop* ; inline
+
+: option? ( obj -- ? )
+ option-stack get assoc-stack ;
+
+GENERIC: nfa-node ( node -- start-state end-state )
+
+: add-simple-entry ( obj class -- start-state end-state )
+ [ next-state next-state 2dup ] 2dip
+ make-transition table add-transition ;
+
+: epsilon-transition ( source target -- )
+ eps <literal-transition> table add-transition ;
+
+M:: star nfa-node ( node -- start end )
+ node term>> nfa-node :> s1 :> s0
+ next-state :> s2
+ next-state :> s3
+ s1 s0 epsilon-transition
+ s2 s0 epsilon-transition
+ s2 s3 epsilon-transition
+ s1 s3 epsilon-transition
+ s2 s3 ;
+
+M: epsilon nfa-node
drop eps literal-transition add-simple-entry ;
-M: word nfa-node ( node -- ) class-transition add-simple-entry ;
+M: concatenation nfa-node ( node -- start end )
+ [ first>> ] [ second>> ] bi
+ reversed-regexp option? [ swap ] when
+ [ nfa-node ] bi@
+ [ epsilon-transition ] dip ;
+
+:: alternate-nodes ( s0 s1 s2 s3 -- start end )
+ next-state :> s4
+ next-state :> s5
+ s4 s0 epsilon-transition
+ s4 s2 epsilon-transition
+ s1 s5 epsilon-transition
+ s3 s5 epsilon-transition
+ s4 s5 ;
+
+M: alternation nfa-node ( node -- start end )
+ [ first>> ] [ second>> ] bi
+ [ nfa-node ] bi@
+ alternate-nodes ;
-M: any-char nfa-node ( node -- )
- [ dotall option? ] dip any-char-no-nl ?
- class-transition add-simple-entry ;
+GENERIC: modify-class ( char-class -- char-class' )
-! M: beginning-of-text nfa-node ( node -- ) ;
+M: object modify-class ;
-M: beginning-of-line nfa-node ( node -- ) class-transition add-simple-entry ;
+M: integer modify-class
+ case-insensitive option? [
+ dup Letter? [
+ [ ch>lower ] [ ch>upper ] bi 2array <or-class>
+ ] when
+ ] when ;
+
+M: integer nfa-node ( node -- start end )
+ modify-class dup class?
+ class-transition literal-transition ?
+ add-simple-entry ;
+
+M: primitive-class modify-class
+ class>> modify-class <primitive-class> ;
+
+M: or-class modify-class
+ seq>> [ modify-class ] map <or-class> ;
-M: end-of-line nfa-node ( node -- ) class-transition add-simple-entry ;
+M: not-class modify-class
+ class>> modify-class <not-class> ;
-: choose-letter-class ( node -- node' )
- case-insensitive option? Letter-class rot ? ;
+M: any-char modify-class
+ drop dotall option? t any-char-no-nl ? ;
-M: letter-class nfa-node ( node -- )
- choose-letter-class class-transition add-simple-entry ;
+: modify-letter-class ( class -- newclass )
+ case-insensitive option? [ drop Letter-class ] when ;
+M: letter-class modify-class modify-letter-class ;
+M: LETTER-class modify-class modify-letter-class ;
-M: LETTER-class nfa-node ( node -- )
- choose-letter-class class-transition add-simple-entry ;
+: cased-range? ( range -- ? )
+ [ from>> ] [ to>> ] bi {
+ [ [ letter? ] bi@ and ]
+ [ [ LETTER? ] bi@ and ]
+ } 2|| ;
-M: character-class-range nfa-node ( node -- )
+M: range modify-class
case-insensitive option? [
- ! This should be implemented for Unicode by case-folding
- ! the input and all strings in the regexp.
- dup [ from>> ] [ to>> ] bi
- 2dup [ Letter? ] bi@ and [
- rot drop
- [ [ ch>lower ] bi@ character-class-range boa ]
- [ [ ch>upper ] bi@ character-class-range boa ] 2bi
- [ class-transition add-simple-entry ] bi@
- alternate-nodes
- ] [
- 2drop
- class-transition add-simple-entry
- ] if
- ] [
- class-transition add-simple-entry
- ] if ;
-
-M: capture-group nfa-node ( node -- )
- "capture-groups" feature-is-broken
- eps literal-transition add-simple-entry
- capture-group-on add-traversal-flag
- term>> nfa-node
- eps literal-transition add-simple-entry
- capture-group-off add-traversal-flag
- 2 [ concatenate-nodes ] times ;
-
-! xyzzy
-M: non-capture-group nfa-node ( node -- )
- term>> nfa-node ;
-
-M: reluctant-kleene-star nfa-node ( node -- )
- term>> <kleene-star> nfa-node ;
-
-M: negation nfa-node ( node -- )
- negation-mode inc
- term>> nfa-node
- negation-mode dec ;
-
-M: lookahead nfa-node ( node -- )
- "lookahead" feature-is-broken
- eps literal-transition add-simple-entry
- lookahead-on add-traversal-flag
- term>> nfa-node
- eps literal-transition add-simple-entry
- lookahead-off add-traversal-flag
- 2 [ concatenate-nodes ] times ;
-
-M: lookbehind nfa-node ( node -- )
- "lookbehind" feature-is-broken
- eps literal-transition add-simple-entry
- lookbehind-on add-traversal-flag
- term>> nfa-node
- eps literal-transition add-simple-entry
- lookbehind-off add-traversal-flag
- 2 [ concatenate-nodes ] times ;
-
-M: option nfa-node ( node -- )
- [ option>> ] [ on?>> ] bi [ option-on ] [ option-off ] if
- eps literal-transition add-simple-entry ;
-
-: construct-nfa ( regexp -- )
+ dup cased-range? [
+ [ from>> ] [ to>> ] bi
+ [ [ ch>lower ] bi@ <range> ]
+ [ [ ch>upper ] bi@ <range> ] 2bi
+ 2array <or-class>
+ ] when
+ ] when ;
+
+M: class nfa-node
+ modify-class class-transition add-simple-entry ;
+
+M: with-options nfa-node ( node -- start end )
+ dup options>> [ tree>> nfa-node ] using-options ;
+
+: construct-nfa ( ast -- nfa-table )
[
- reset-regexp
- negation-mode off
- [ current-regexp set ]
- [ parse-tree>> nfa-node ]
- [ set-start-state ] tri
+ 0 state set
+ <transition-table> nfa-table set
+ remove-lookahead nfa-node
+ table
+ swap dup associate >>final-states
+ swap >>start-state
] with-scope ;
-USING: kernel tools.test regexp.backend regexp ;
-IN: regexp.parser
+USING: kernel tools.test regexp.parser fry sequences ;
+IN: regexp.parser.tests
-: test-regexp ( string -- )
- default-regexp parse-regexp ;
+: regexp-parses ( string -- )
+ [ [ ] ] dip '[ _ parse-regexp drop ] unit-test ;
-! [ "(" ] [ unmatched-parentheses? ] must-fail-with
+: regexp-fails ( string -- )
+ '[ _ parse-regexp ] must-fail ;
-[ ] [ "a|b" test-regexp ] unit-test
-[ ] [ "a.b" test-regexp ] unit-test
-[ ] [ "a|b|c" test-regexp ] unit-test
-[ ] [ "abc|b" test-regexp ] unit-test
-[ ] [ "a|bcd" test-regexp ] unit-test
-[ ] [ "a|(b)" test-regexp ] unit-test
-[ ] [ "(a)|b" test-regexp ] unit-test
-[ ] [ "(a|b)" test-regexp ] unit-test
-[ ] [ "((a)|(b))" test-regexp ] unit-test
+{
+ "a|b" "a.b" "a|b|c" "abc|b" "a|bcd" "a|(b)" "(?-i:a)" "||"
+ "(a)|b" "(a|b)" "((a)|(b))" "(?:a)" "(?i:a)" "|b" "b|"
+ "[abc]" "[a-c]" "[^a-c]" "[^]]" "[]a]" "[[]" "[]-a]" "[a-]" "[-]"
+ "[--a]" "foo*" "(foo)*" "(a|b)|c" "(foo){2,3}" "(foo){2,}"
+ "(foo){2}" "{2,3}" "{," "{,}" "}" "foo}" "[^]-a]" "[^-]a]"
+ "[a-]" "[^a-]" "[^a-]" "a{,2}" "(?#foobar)"
+ "\\p{Space}" "\\t" "\\[" "[\\]]" "\\P{Space}"
+ "\\ueeee" "\\0333" "\\xff" "\\\\" "\\w"
+} [ regexp-parses ] each
-[ ] [ "(?:a)" test-regexp ] unit-test
-[ ] [ "(?i:a)" test-regexp ] unit-test
-[ ] [ "(?-i:a)" test-regexp ] unit-test
-[ "(?z:a)" test-regexp ] [ unknown-regexp-option? ] must-fail-with
-[ "(?-z:a)" test-regexp ] [ unknown-regexp-option? ] must-fail-with
-
-[ ] [ "(?=a)" test-regexp ] unit-test
-
-[ ] [ "[abc]" test-regexp ] unit-test
-[ ] [ "[a-c]" test-regexp ] unit-test
-[ ] [ "[^a-c]" test-regexp ] unit-test
-[ "[^]" test-regexp ] must-fail
-
-[ ] [ "|b" test-regexp ] unit-test
-[ ] [ "b|" test-regexp ] unit-test
-[ ] [ "||" test-regexp ] unit-test
+{
+ "[^]" "[]" "a{foo}" "a{,}" "a{}" "(?)" "\\p{foo}" "\\P{foo}"
+ "\\ueeeg" "\\0339" "\\xfg"
+} [ regexp-fails ] each
-! Copyright (C) 2008 Doug Coleman.
+! Copyright (C) 2008, 2009 Doug Coleman, Daniel Ehrenberg.
! See http://factorcode.org/license.txt for BSD license.
-USING: accessors arrays assocs combinators io io.streams.string
-kernel math math.parser namespaces sets
-quotations sequences splitting vectors math.order
-strings regexp.backend regexp.utils
-unicode.case unicode.categories words locals regexp.classes ;
+USING: peg.ebnf kernel math.parser sequences assocs arrays fry math
+combinators regexp.classes strings splitting peg locals accessors
+regexp.ast ;
IN: regexp.parser
-FROM: math.ranges => [a,b] ;
-
-TUPLE: concatenation seq ; INSTANCE: concatenation node
-TUPLE: alternation seq ; INSTANCE: alternation node
-TUPLE: kleene-star term ; INSTANCE: kleene-star node
-
-! !!!!!!!!
-TUPLE: possessive-question term ; INSTANCE: possessive-question node
-TUPLE: possessive-kleene-star term ; INSTANCE: possessive-kleene-star node
-
-! !!!!!!!!
-TUPLE: reluctant-question term ; INSTANCE: reluctant-question node
-TUPLE: reluctant-kleene-star term ; INSTANCE: reluctant-kleene-star node
-
-TUPLE: negation term ; INSTANCE: negation node
-TUPLE: constant char ; INSTANCE: constant node
-TUPLE: range from to ; INSTANCE: range node
-
-MIXIN: parentheses-group
-TUPLE: lookahead term ; INSTANCE: lookahead node
-INSTANCE: lookahead parentheses-group
-TUPLE: lookbehind term ; INSTANCE: lookbehind node
-INSTANCE: lookbehind parentheses-group
-TUPLE: capture-group term ; INSTANCE: capture-group node
-INSTANCE: capture-group parentheses-group
-TUPLE: non-capture-group term ; INSTANCE: non-capture-group node
-INSTANCE: non-capture-group parentheses-group
-TUPLE: independent-group term ; INSTANCE: independent-group node ! atomic group
-INSTANCE: independent-group parentheses-group
-TUPLE: comment-group term ; INSTANCE: comment-group node
-INSTANCE: comment-group parentheses-group
-
-SINGLETON: epsilon INSTANCE: epsilon node
-
-TUPLE: option option on? ; INSTANCE: option node
-
-SINGLETONS: unix-lines dotall multiline comments case-insensitive
-unicode-case reversed-regexp ;
-
-SINGLETONS: beginning-of-character-class end-of-character-class
-left-parenthesis pipe caret dash ;
-
-: push1 ( obj -- ) input-stream get stream>> push ;
-: peek1 ( -- obj ) input-stream get stream>> [ f ] [ peek ] if-empty ;
-: pop3 ( seq -- obj1 obj2 obj3 ) [ pop ] [ pop ] [ pop ] tri spin ;
-: drop1 ( -- ) read1 drop ;
-
-: stack ( -- obj ) current-regexp get stack>> ;
-: change-whole-stack ( quot -- )
- current-regexp get
- [ stack>> swap call ] keep (>>stack) ; inline
-: push-stack ( obj -- ) stack push ;
-: pop-stack ( -- obj ) stack pop ;
-: cut-out ( vector n -- vector' vector ) cut rest ;
-ERROR: cut-stack-error ;
-: cut-stack ( obj vector -- vector' vector )
- [ nip ] [ last-index ] 2bi [ cut-stack-error ] unless* cut-out swap ;
-
-: <possessive-kleene-star> ( obj -- kleene ) possessive-kleene-star boa ;
-: <reluctant-kleene-star> ( obj -- kleene ) reluctant-kleene-star boa ;
-: <possessive-question> ( obj -- kleene ) possessive-question boa ;
-: <reluctant-question> ( obj -- kleene ) reluctant-question boa ;
-
-: <negation> ( obj -- negation ) negation boa ;
-: <concatenation> ( seq -- concatenation )
- >vector [ epsilon ] [ concatenation boa ] if-empty ;
-: <alternation> ( seq -- alternation ) >vector alternation boa ;
-: <capture-group> ( obj -- capture-group ) capture-group boa ;
-: <kleene-star> ( obj -- kleene-star ) kleene-star boa ;
-: <constant> ( obj -- constant ) constant boa ;
-
-: first|concatenation ( seq -- first/concatenation )
- dup length 1 = [ first ] [ <concatenation> ] if ;
-
-: first|alternation ( seq -- first/alternation )
- dup length 1 = [ first ] [ <alternation> ] if ;
-
-: <character-class-range> ( from to -- obj )
- 2dup <
- [ character-class-range boa ] [ 2drop unmatchable-class ] if ;
-
-ERROR: unmatched-parentheses ;
-
-ERROR: unknown-regexp-option option ;
+: allowed-char? ( ch -- ? )
+ ".()|[*+?" member? not ;
-: ch>option ( ch -- singleton )
+ERROR: bad-number ;
+
+: ensure-number ( n -- n )
+ [ bad-number ] unless* ;
+
+:: at-error ( key assoc quot: ( key -- replacement ) -- value )
+ key assoc at* [ drop key quot call ] unless ; inline
+
+ERROR: bad-class name ;
+
+: name>class ( name -- class )
+ {
+ { "Lower" letter-class }
+ { "Upper" LETTER-class }
+ { "Alpha" Letter-class }
+ { "ASCII" ascii-class }
+ { "Digit" digit-class }
+ { "Alnum" alpha-class }
+ { "Punct" punctuation-class }
+ { "Graph" java-printable-class }
+ { "Print" java-printable-class }
+ { "Blank" non-newline-blank-class }
+ { "Cntrl" control-character-class }
+ { "XDigit" hex-digit-class }
+ { "Space" java-blank-class }
+ ! TODO: unicode-character-class
+ } [ bad-class ] at-error ;
+
+: lookup-escape ( char -- ast )
{
- { CHAR: i [ case-insensitive ] }
- { CHAR: d [ unix-lines ] }
- { CHAR: m [ multiline ] }
- { CHAR: n [ multiline ] }
- { CHAR: r [ reversed-regexp ] }
- { CHAR: s [ dotall ] }
- { CHAR: u [ unicode-case ] }
- { CHAR: x [ comments ] }
- [ unknown-regexp-option ]
+ { CHAR: t [ CHAR: \t ] }
+ { CHAR: n [ CHAR: \n ] }
+ { CHAR: r [ CHAR: \r ] }
+ { CHAR: f [ HEX: c ] }
+ { CHAR: a [ HEX: 7 ] }
+ { CHAR: e [ HEX: 1b ] }
+ { CHAR: \\ [ CHAR: \\ ] }
+
+ { CHAR: w [ c-identifier-class <primitive-class> ] }
+ { CHAR: W [ c-identifier-class <primitive-class> <not-class> ] }
+ { CHAR: s [ java-blank-class <primitive-class> ] }
+ { CHAR: S [ java-blank-class <primitive-class> <not-class> ] }
+ { CHAR: d [ digit-class <primitive-class> ] }
+ { CHAR: D [ digit-class <primitive-class> <not-class> ] }
+
+ [ ]
} case ;
+: options-assoc ( -- assoc )
+ H{
+ { CHAR: i case-insensitive }
+ { CHAR: d unix-lines }
+ { CHAR: m multiline }
+ { CHAR: n multiline }
+ { CHAR: r reversed-regexp }
+ { CHAR: s dotall }
+ { CHAR: u unicode-case }
+ { CHAR: x comments }
+ } ;
+
+: ch>option ( ch -- singleton )
+ options-assoc at ;
+
: option>ch ( option -- string )
- {
- { case-insensitive [ CHAR: i ] }
- { multiline [ CHAR: m ] }
- { reversed-regexp [ CHAR: r ] }
- { dotall [ CHAR: s ] }
- [ unknown-regexp-option ]
- } case ;
+ options-assoc value-at ;
-: toggle-option ( ch ? -- )
- [ ch>option ] dip option boa push-stack ;
-
-: (parse-options) ( string ? -- ) [ toggle-option ] curry each ;
-
-: parse-options ( string -- )
- "-" split1 [ t (parse-options) ] [ f (parse-options) ] bi* ;
-
-ERROR: bad-special-group string ;
-
-DEFER: (parse-regexp)
-: nested-parse-regexp ( token ? -- )
- [ push-stack (parse-regexp) pop-stack ] dip
- [ <negation> ] when pop-stack new swap >>term push-stack ;
-
-! non-capturing groups
-: (parse-special-group) ( -- )
- read1 {
- { [ dup CHAR: # = ] ! comment
- [ drop comment-group f nested-parse-regexp pop-stack drop ] }
- { [ dup CHAR: : = ]
- [ drop non-capture-group f nested-parse-regexp ] }
- { [ dup CHAR: = = ]
- [ drop lookahead f nested-parse-regexp ] }
- { [ dup CHAR: ! = ]
- [ drop lookahead t nested-parse-regexp ] }
- { [ dup CHAR: > = ]
- [ drop non-capture-group f nested-parse-regexp ] }
- { [ dup CHAR: < = peek1 CHAR: = = and ]
- [ drop drop1 lookbehind f nested-parse-regexp ] }
- { [ dup CHAR: < = peek1 CHAR: ! = and ]
- [ drop drop1 lookbehind t nested-parse-regexp ] }
- [
- ":)" read-until
- [ swap prefix ] dip
- {
- { CHAR: : [ parse-options non-capture-group f nested-parse-regexp ] }
- { CHAR: ) [ parse-options ] }
- [ drop bad-special-group ]
- } case
- ]
- } cond ;
-
-: handle-left-parenthesis ( -- )
- peek1 CHAR: ? =
- [ drop1 (parse-special-group) ]
- [ capture-group f nested-parse-regexp ] if ;
-
-: handle-dot ( -- ) any-char push-stack ;
-: handle-pipe ( -- ) pipe push-stack ;
-: (handle-star) ( obj -- kleene-star )
- peek1 {
- { CHAR: + [ drop1 <possessive-kleene-star> ] }
- { CHAR: ? [ drop1 <reluctant-kleene-star> ] }
- [ drop <kleene-star> ]
- } case ;
-: handle-star ( -- ) stack pop (handle-star) push-stack ;
-: handle-question ( -- )
- stack pop peek1 {
- { CHAR: + [ drop1 <possessive-question> ] }
- { CHAR: ? [ drop1 <reluctant-question> ] }
- [ drop epsilon 2array <alternation> ]
- } case push-stack ;
-: handle-plus ( -- )
- stack pop dup (handle-star)
- 2array <concatenation> push-stack ;
-
-ERROR: unmatched-brace ;
-: parse-repetition ( -- start finish ? )
- "}" read-until [ unmatched-brace ] unless
- [ "," split1 [ string>number ] bi@ ]
- [ CHAR: , swap index >boolean ] bi ;
-
-: replicate/concatenate ( n obj -- obj' )
- over zero? [ 2drop epsilon ]
- [ <repetition> first|concatenation ] if ;
-
-: exactly-n ( n -- )
- stack pop replicate/concatenate push-stack ;
-
-: at-least-n ( n -- )
- stack pop
- [ replicate/concatenate ] keep
- <kleene-star> 2array <concatenation> push-stack ;
-
-: at-most-n ( n -- )
- 1+
- stack pop
- [ replicate/concatenate ] curry map <alternation> push-stack ;
-
-: from-m-to-n ( m n -- )
- [a,b]
- stack pop
- [ replicate/concatenate ] curry map
- <alternation> push-stack ;
-
-ERROR: invalid-range a b ;
-
-: handle-left-brace ( -- )
- parse-repetition
- [ 2dup [ [ 0 < [ invalid-range ] when ] when* ] bi@ ] dip
- [
- 2dup and [ from-m-to-n ]
- [ [ nip at-most-n ] [ at-least-n ] if* ] if
- ] [ drop 0 max exactly-n ] if ;
-
-: handle-front-anchor ( -- ) beginning-of-line push-stack ;
-: handle-back-anchor ( -- ) end-of-line push-stack ;
-
-ERROR: bad-character-class obj ;
-ERROR: expected-posix-class ;
-
-: parse-posix-class ( -- obj )
- read1 CHAR: { = [ expected-posix-class ] unless
- "}" read-until [ bad-character-class ] unless
- {
- { "Lower" [ letter-class ] }
- { "Upper" [ LETTER-class ] }
- { "Alpha" [ Letter-class ] }
- { "ASCII" [ ascii-class ] }
- { "Digit" [ digit-class ] }
- { "Alnum" [ alpha-class ] }
- { "Punct" [ punctuation-class ] }
- { "Graph" [ java-printable-class ] }
- { "Print" [ java-printable-class ] }
- { "Blank" [ non-newline-blank-class ] }
- { "Cntrl" [ control-character-class ] }
- { "XDigit" [ hex-digit-class ] }
- { "Space" [ java-blank-class ] }
- ! TODO: unicode-character-class, fallthrough in unicode is bad-char-clss
- [ bad-character-class ]
- } case ;
+: parse-options ( on off -- options )
+ [ [ ch>option ] { } map-as ] bi@ <options> ;
-: parse-octal ( -- n ) 3 read oct> check-octal ;
-: parse-short-hex ( -- n ) 2 read hex> check-hex ;
-: parse-long-hex ( -- n ) 6 read hex> check-hex ;
-: parse-control-character ( -- n ) read1 ;
+: string>options ( string -- options )
+ "-" split1 parse-options ;
+
+: options>string ( options -- string )
+ [ on>> ] [ off>> ] bi
+ [ [ option>ch ] map ] bi@
+ [ "-" glue ] unless-empty
+ "" like ;
-ERROR: bad-escaped-literals seq ;
+! TODO: add syntax for various parenthized things,
+! add greedy and nongreedy forms of matching
+! (once it's all implemented)
-: parse-til-E ( -- obj )
- "\\E" read-until [ bad-escaped-literals ] unless ;
-
-:: (parse-escaped-literals) ( quot: ( obj -- obj' ) -- obj )
- parse-til-E
- drop1
- [ epsilon ] [
- quot call [ <constant> ] V{ } map-as
- first|concatenation
- ] if-empty ; inline
+EBNF: parse-regexp
-: parse-escaped-literals ( -- obj )
- [ ] (parse-escaped-literals) ;
+CharacterInBracket = !("}") Character
-: lower-case-literals ( -- obj )
- [ >lower ] (parse-escaped-literals) ;
+QuotedCharacter = !("\\E") .
-: upper-case-literals ( -- obj )
- [ >upper ] (parse-escaped-literals) ;
+Escape = "p{" CharacterInBracket*:s "}" => [[ s >string name>class <primitive-class> ]]
+ | "P{" CharacterInBracket*:s "}" => [[ s >string name>class <primitive-class> <negation> ]]
+ | "Q" QuotedCharacter*:s "\\E" => [[ s <concatenation> ]]
+ | "u" Character:a Character:b Character:c Character:d
+ => [[ { a b c d } hex> ensure-number ]]
+ | "x" Character:a Character:b
+ => [[ { a b } hex> ensure-number ]]
+ | "0" Character:a Character:b Character:c
+ => [[ { a b c } oct> ensure-number ]]
+ | . => [[ lookup-escape ]]
-: parse-escaped ( -- obj )
- read1
- {
- { CHAR: t [ CHAR: \t <constant> ] }
- { CHAR: n [ CHAR: \n <constant> ] }
- { CHAR: r [ CHAR: \r <constant> ] }
- { CHAR: f [ HEX: c <constant> ] }
- { CHAR: a [ HEX: 7 <constant> ] }
- { CHAR: e [ HEX: 1b <constant> ] }
-
- { CHAR: w [ c-identifier-class ] }
- { CHAR: W [ c-identifier-class <negation> ] }
- { CHAR: s [ java-blank-class ] }
- { CHAR: S [ java-blank-class <negation> ] }
- { CHAR: d [ digit-class ] }
- { CHAR: D [ digit-class <negation> ] }
-
- { CHAR: p [ parse-posix-class ] }
- { CHAR: P [ parse-posix-class <negation> ] }
- { CHAR: x [ parse-short-hex <constant> ] }
- { CHAR: u [ parse-long-hex <constant> ] }
- { CHAR: 0 [ parse-octal <constant> ] }
- { CHAR: c [ parse-control-character ] }
-
- { CHAR: Q [ parse-escaped-literals ] }
-
- ! { CHAR: b [ word-boundary-class ] }
- ! { CHAR: B [ word-boundary-class <negation> ] }
- ! { CHAR: A [ handle-beginning-of-input ] }
- ! { CHAR: z [ handle-end-of-input ] }
-
- ! { CHAR: Z [ handle-end-of-input ] } ! plus a final terminator
-
- ! m//g mode
- ! { CHAR: G [ end of previous match ] }
-
- ! Group capture
- ! { CHAR: 1 [ CHAR: 1 <constant> ] }
- ! { CHAR: 2 [ CHAR: 2 <constant> ] }
- ! { CHAR: 3 [ CHAR: 3 <constant> ] }
- ! { CHAR: 4 [ CHAR: 4 <constant> ] }
- ! { CHAR: 5 [ CHAR: 5 <constant> ] }
- ! { CHAR: 6 [ CHAR: 6 <constant> ] }
- ! { CHAR: 7 [ CHAR: 7 <constant> ] }
- ! { CHAR: 8 [ CHAR: 8 <constant> ] }
- ! { CHAR: 9 [ CHAR: 9 <constant> ] }
-
- ! Perl extensions
- ! can't do \l and \u because \u is already a 4-hex
- { CHAR: L [ lower-case-literals ] }
- { CHAR: U [ upper-case-literals ] }
-
- [ <constant> ]
- } case ;
+EscapeSequence = "\\" Escape:e => [[ e ]]
-: handle-escape ( -- ) parse-escaped push-stack ;
-
-: handle-dash ( vector -- vector' )
- H{ { dash CHAR: - } } substitute ;
-
-: character-class>alternation ( seq -- alternation )
- [ dup number? [ <constant> ] when ] map first|alternation ;
-
-: handle-caret ( vector -- vector' )
- dup [ length 2 >= ] [ first caret eq? ] bi and [
- rest-slice character-class>alternation <negation>
- ] [
- character-class>alternation
- ] if ;
-
-: make-character-class ( -- character-class )
- [ beginning-of-character-class swap cut-stack ] change-whole-stack
- handle-dash handle-caret ;
-
-: apply-dash ( -- )
- stack [ pop3 nip <character-class-range> ] keep push ;
-
-: apply-dash? ( -- ? )
- stack dup length 3 >=
- [ [ length 2 - ] keep nth dash eq? ] [ drop f ] if ;
-
-ERROR: empty-negated-character-class ;
-DEFER: handle-left-bracket
-: (parse-character-class) ( -- )
- read1 [ empty-negated-character-class ] unless* {
- { CHAR: [ [ handle-left-bracket t ] }
- { CHAR: ] [ make-character-class push-stack f ] }
- { CHAR: - [ dash push-stack t ] }
- { CHAR: \ [ parse-escaped push-stack t ] }
- [ push-stack apply-dash? [ apply-dash ] when t ]
- } case
- [ (parse-character-class) ] when ;
-
-: push-constant ( ch -- ) <constant> push-stack ;
-
-: parse-character-class-second ( -- )
- read1 {
- { CHAR: [ [ CHAR: [ push-constant ] }
- { CHAR: ] [ CHAR: ] push-constant ] }
- { CHAR: - [ CHAR: - push-constant ] }
- [ push1 ]
- } case ;
+Character = EscapeSequence | . ?[ allowed-char? ]?
-: parse-character-class-first ( -- )
- read1 {
- { CHAR: ^ [ caret push-stack parse-character-class-second ] }
- { CHAR: [ [ CHAR: [ push-constant ] }
- { CHAR: ] [ CHAR: ] push-constant ] }
- { CHAR: - [ CHAR: - push-constant ] }
- [ push1 ]
- } case ;
+AnyRangeCharacter = EscapeSequence | .
-: handle-left-bracket ( -- )
- beginning-of-character-class push-stack
- parse-character-class-first (parse-character-class) ;
+RangeCharacter = !("]") AnyRangeCharacter
-: finish-regexp-parse ( stack -- obj )
- { pipe } split
- [ first|concatenation ] map first|alternation ;
+Range = RangeCharacter:a "-" RangeCharacter:b => [[ a b <range> ]]
+ | RangeCharacter
-: handle-right-parenthesis ( -- )
- stack dup [ parentheses-group "members" word-prop member? ] find-last
- -rot cut rest
- [ [ push ] keep current-regexp get (>>stack) ]
- [ finish-regexp-parse push-stack ] bi* ;
+StartRange = AnyRangeCharacter:a "-" RangeCharacter:b => [[ a b <range> ]]
+ | AnyRangeCharacter
-: parse-regexp-token ( token -- ? )
- {
- { CHAR: ( [ handle-left-parenthesis t ] } ! handle (?..) at beginning?
- { CHAR: ) [ handle-right-parenthesis f ] }
- { CHAR: . [ handle-dot t ] }
- { CHAR: | [ handle-pipe t ] }
- { CHAR: ? [ handle-question t ] }
- { CHAR: * [ handle-star t ] }
- { CHAR: + [ handle-plus t ] }
- { CHAR: { [ handle-left-brace t ] }
- { CHAR: [ [ handle-left-bracket t ] }
- { CHAR: \ [ handle-escape t ] }
- [
- dup CHAR: $ = peek1 f = and
- [ drop handle-back-anchor f ]
- [ push-constant t ] if
- ]
- } case ;
+Ranges = StartRange:s Range*:r => [[ r s prefix ]]
+
+CharClass = "^"?:n Ranges:e => [[ e n char-class ]]
+
+Options = [idmsux]*
+
+Parenthized = "?:" Alternation:a => [[ a ]]
+ | "?" Options:on "-"? Options:off ":" Alternation:a
+ => [[ a on off parse-options <with-options> ]]
+ | "?#" [^)]* => [[ f ]]
+ | "?~" Alternation:a => [[ a <negation> ]]
+ | "?=" Alternation:a => [[ a <lookahead> ]]
+ | "?!" Alternation:a => [[ a <negation> <lookahead> ]]
+ | "?<=" Alternation:a => [[ a <lookbehind> ]]
+ | "?<!" Alternation:a => [[ a <negation> <lookbehind> ]]
+ | Alternation
+
+Element = "(" Parenthized:p ")" => [[ p ]]
+ | "[" CharClass:r "]" => [[ r ]]
+ | ".":d => [[ any-char <primitive-class> ]]
+ | Character
+
+Number = (!(","|"}").)* => [[ string>number ensure-number ]]
+
+Times = "," Number:n "}" => [[ 0 n <from-to> ]]
+ | Number:n ",}" => [[ n <at-least> ]]
+ | Number:n "}" => [[ n n <from-to> ]]
+ | "}" => [[ bad-number ]]
+ | Number:n "," Number:m "}" => [[ n m <from-to> ]]
+
+Repeated = Element:e "{" Times:t => [[ e t <times> ]]
+ | Element:e "?" => [[ e <maybe> ]]
+ | Element:e "*" => [[ e <star> ]]
+ | Element:e "+" => [[ e <plus> ]]
+ | Element
+
+Concatenation = Repeated*:r => [[ r sift <concatenation> ]]
+
+Alternation = Concatenation:c ("|" Concatenation)*:a
+ => [[ a empty? [ c ] [ a values c prefix <alternation> ] if ]]
+
+End = !(.)
-: (parse-regexp) ( -- )
- read1 [ parse-regexp-token [ (parse-regexp) ] when ] when* ;
-
-: parse-regexp-beginning ( -- )
- peek1 CHAR: ^ = [ drop1 handle-front-anchor ] when ;
-
-: parse-regexp ( regexp -- )
- dup current-regexp [
- raw>> [
- <string-reader> [
- parse-regexp-beginning (parse-regexp)
- ] with-input-stream
- ] unless-empty
- current-regexp get [ finish-regexp-parse ] change-stack
- dup stack>> >>parse-tree drop
- ] with-variable ;
+Main = Alternation End
+;EBNF
! Copyright (C) 2008 Doug Coleman.
! See http://factorcode.org/license.txt for BSD license.
-USING: kernel strings help.markup help.syntax regexp.backend ;
+USING: kernel strings help.markup help.syntax ;
IN: regexp
HELP: <regexp>
! Dotall mode -- when on, . matches newlines.
! Off by default.
[ f ] [ "\n" "." <regexp> matches? ] unit-test
-[ t ] [ "\n" "(?s)." <regexp> matches? ] unit-test
+! [ t ] [ "\n" "(?s)." <regexp> matches? ] unit-test
[ t ] [ "\n" R/ ./s matches? ] unit-test
-[ f ] [ "\n\n" "(?s).(?-s)." <regexp> matches? ] unit-test
+! [ f ] [ "\n\n" "(?s).(?-s)." <regexp> matches? ] unit-test
[ f ] [ "" ".+" <regexp> matches? ] unit-test
[ t ] [ "a" ".+" <regexp> matches? ] unit-test
[ t ] [ "bar" "foo|bar" <regexp> matches? ] unit-test
[ f ] [ "foobar" "foo|bar" <regexp> matches? ] unit-test
-/*
-! FIXME
[ f ] [ "" "(a)" <regexp> matches? ] unit-test
[ t ] [ "a" "(a)" <regexp> matches? ] unit-test
[ f ] [ "aa" "(a)" <regexp> matches? ] unit-test
[ f ] [ "aababaaabbac" "(a|b)+" <regexp> matches? ] unit-test
[ t ] [ "ababaaabba" "(a|b)+" <regexp> matches? ] unit-test
-*/
[ f ] [ "" "a{1}" <regexp> matches? ] unit-test
[ t ] [ "a" "a{1}" <regexp> matches? ] unit-test
[ f ] [ "0" "[^\\d]" <regexp> matches? ] unit-test
[ t ] [ "a" "[^\\d]" <regexp> matches? ] unit-test
-/*
-! FIXME
[ t ] [ "a" "[a-z]{1,}|[A-Z]{2,4}|b*|c|(f|g)*" <regexp> matches? ] unit-test
[ t ] [ "a" "[a-z]{1,2}|[A-Z]{3,3}|b*|c|(f|g)*" <regexp> matches? ] unit-test
[ t ] [ "a" "[a-z]{1,2}|[A-Z]{3,3}" <regexp> matches? ] unit-test
-*/
[ t ] [ "1000" "\\d{4,6}" <regexp> matches? ] unit-test
[ t ] [ "1000" "[0-9]{4,6}" <regexp> matches? ] unit-test
[ t ] [ "SXY" "\\0123XY" <regexp> matches? ] unit-test
[ t ] [ "x" "\\x78" <regexp> matches? ] unit-test
[ f ] [ "y" "\\x78" <regexp> matches? ] unit-test
-[ t ] [ "x" "\\u000078" <regexp> matches? ] unit-test
-[ f ] [ "y" "\\u000078" <regexp> matches? ] unit-test
+[ t ] [ "x" "\\u0078" <regexp> matches? ] unit-test
+[ f ] [ "y" "\\u0078" <regexp> matches? ] unit-test
[ t ] [ "ab" "a+b" <regexp> matches? ] unit-test
[ f ] [ "b" "a+b" <regexp> matches? ] unit-test
[ t ] [ "c" R/ [A-Z]/i matches? ] unit-test
[ f ] [ "3" R/ [A-Z]/i matches? ] unit-test
+/*
[ t ] [ "a" "(?i)a" <regexp> matches? ] unit-test
[ t ] [ "a" "(?i)a" <regexp> matches? ] unit-test
[ t ] [ "A" "(?i)a" <regexp> matches? ] unit-test
[ t ] [ "a" R/ (?-i)a/i matches? ] unit-test
[ f ] [ "A" R/ (?-i)a/i matches? ] unit-test
[ f ] [ "A" R/ (?-i)a/i matches? ] unit-test
+*/
[ f ] [ "A" "[a-z]" <regexp> matches? ] unit-test
[ t ] [ "A" R/ [a-z]/i matches? ] unit-test
[ t ] [ "abc*" "[^\\*]*\\*" <regexp> matches? ] unit-test
[ t ] [ "bca" "[^a]*a" <regexp> matches? ] unit-test
-/*
-! FIXME
[ ] [
"(0[lL]?|[1-9]\\d{0,9}(\\d{0,9}[lL])?|0[xX]\\p{XDigit}{1,8}(\\p{XDigit}{0,8}[lL])?|0[0-7]{1,11}([0-7]{0,11}[lL])?|([0-9]+\\.[0-9]*|\\.[0-9]+)([eE][+-]?[0-9]+)?[fFdD]?|[0-9]+([eE][+-]?[0-9]+[fFdD]?|([eE][+-]?[0-9]+)?[fFdD]))"
<regexp> drop
[ "abc" ] [ "abc" "(ab|a)(bc)?" <regexp> first-match >string ] unit-test
[ "b" ] [ "aaaaaaaaaaaaaaaaaaaaaaab" "((a*)*b)*b" <regexp> first-match >string ] unit-test
-*/
! [ t ] [ "a:b" ".+:?" <regexp> matches? ] unit-test
[ "-- title --" ] [ "== title ==" R/ =/ "-" re-replace ] unit-test
/*
-! FIXME
[ f ] [ "ab" "a(?!b)" <regexp> first-match ] unit-test
[ "a" ] [ "ac" "a(?!b)" <regexp> first-match >string ] unit-test
[ t ] [ "fxxbar" "(?!foo).{3}bar" <regexp> matches? ] unit-test
! Bug in parsing word
[ t ] [ "a" R' a' matches? ] unit-test
-! Convert to lowercase until E
-[ f ] [ "AA" R/ \LAA\E/ matches? ] unit-test
-[ t ] [ "aa" R/ \LAA\E/ matches? ] unit-test
+! Testing negation
+[ f ] [ "a" R/ (?~a)/ matches? ] unit-test
+[ t ] [ "aa" R/ (?~a)/ matches? ] unit-test
+[ t ] [ "bb" R/ (?~a)/ matches? ] unit-test
+[ t ] [ "" R/ (?~a)/ matches? ] unit-test
-! Convert to uppercase until E
-[ t ] [ "AA" R/ \Uaa\E/ matches? ] unit-test
-[ f ] [ "aa" R/ \Uaa\E/ matches? ] unit-test
+[ f ] [ "a" R/ (?~a+|b)/ matches? ] unit-test
+[ f ] [ "aa" R/ (?~a+|b)/ matches? ] unit-test
+[ t ] [ "bb" R/ (?~a+|b)/ matches? ] unit-test
+[ f ] [ "b" R/ (?~a+|b)/ matches? ] unit-test
+[ t ] [ "" R/ (?~a+|b)/ matches? ] unit-test
-! [ "{Lower}" <regexp> ] [ invalid-range? ] must-fail-with
+! Intersecting classes
+[ t ] [ "ab" R/ ac|\p{Lower}b/ matches? ] unit-test
+[ t ] [ "ab" R/ ac|[a-z]b/ matches? ] unit-test
! [ t ] [ "a" R/ ^a/ matches? ] unit-test
! [ f ] [ "\na" R/ ^a/ matches? ] unit-test
! [ t ] [ "\r\na" R/ ^a/m matches? ] unit-test
! [ t ] [ "\ra" R/ ^a/m matches? ] unit-test
-! [ t ] [ "a" "a$" R/ a$/m matches? ] unit-test
-! [ t ] [ "a\n" "a$" R/ a$/m matches? ] unit-test
-! [ t ] [ "a\r" "a$" R/ a$/m matches? ] unit-test
-! [ t ] [ "a\r\n" "a$" R/ a$/m matches? ] unit-test
+! [ t ] [ "a" R/ a$/m matches? ] unit-test
+! [ t ] [ "a\n" R/ a$/m matches? ] unit-test
+! [ t ] [ "a\r" R/ a$/m matches? ] unit-test
+! [ t ] [ "a\r\n" R/ a$/m matches? ] unit-test
! [ f ] [ "foobxr" "foo\\z" <regexp> match-head ] unit-test
! [ 3 ] [ "foo" "foo\\z" <regexp> match-head ] unit-test
! See http://factorcode.org/license.txt for BSD license.
USING: accessors combinators kernel math sequences strings sets
assocs prettyprint.backend prettyprint.custom make lexer
-namespaces parser arrays fry regexp.backend regexp.utils
+namespaces parser arrays fry locals regexp.minimize
regexp.parser regexp.nfa regexp.dfa regexp.traversal
-regexp.transition-tables splitting sorting ;
+regexp.transition-tables splitting sorting regexp.ast
+regexp.negation ;
IN: regexp
-: default-regexp ( string -- regexp )
- regexp new
- swap >>raw
- <transition-table> >>nfa-table
- <transition-table> >>dfa-table
- <transition-table> >>minimized-table
- H{ } clone >>nfa-traversal-flags
- H{ } clone >>dfa-traversal-flags
- H{ } clone >>options
- H{ } clone >>matchers
- reset-regexp ;
-
-: construct-regexp ( regexp -- regexp' )
- {
- [ parse-regexp ]
- [ construct-nfa ]
- [ construct-dfa ]
- [ ]
- } cleave ;
+TUPLE: regexp raw parse-tree options dfa ;
+
+: <optioned-regexp> ( string options -- regexp )
+ [ dup parse-regexp ] [ string>options ] bi*
+ 2dup <with-options> ast>dfa
+ regexp boa ;
+
+: <regexp> ( string -- regexp ) "" <optioned-regexp> ;
+
+<PRIVATE
: (match) ( string regexp -- dfa-traverser )
- <dfa-traverser> do-match ; inline
+ dfa>> <dfa-traverser> do-match ; inline
+
+PRIVATE>
: match ( string regexp -- slice/f )
(match) return-match ;
-: match* ( string regexp -- slice/f captured-groups )
- (match) [ return-match ] [ captured-groups>> ] bi ;
-
: matches? ( string regexp -- ? )
dupd match
[ [ length ] bi@ = ] [ drop f ] if* ;
dupd first-match
[ split1-slice swap ] [ "" like f swap ] if* ;
+<PRIVATE
+
: (re-split) ( string regexp -- )
over [ [ re-cut , ] keep (re-split) ] [ 2drop ] if ;
+PRIVATE>
+
: re-split ( string regexp -- seq )
[ (re-split) ] { } make ;
{ "R| " "|" }
} swap [ subseq? not nip ] curry assoc-find drop ;
-: string>options ( string -- options )
- [ ch>option dup ] H{ } map>assoc ;
-
-: options>string ( options -- string )
- keys [ option>ch ] map natural-sort >string ;
-
-PRIVATE>
-
-: <optioned-regexp> ( string option-string -- regexp )
- [ default-regexp ] [ string>options ] bi* >>options
- construct-regexp ;
-
-: <regexp> ( string -- regexp ) "" <optioned-regexp> ;
-
-<PRIVATE
-
: parsing-regexp ( accum end -- accum )
lexer get dup skip-blank
[ [ index-from dup 1+ swap ] 2keep swapd subseq swap ] change-lexer-column
! Copyright (C) 2008 Doug Coleman.
! See http://factorcode.org/license.txt for BSD license.
USING: accessors arrays assocs fry hashtables kernel sequences
-vectors regexp.utils ;
+vectors ;
IN: regexp.transition-tables
TUPLE: transition from to obj ;
#! set the state as a key
2dup [ to>> ] dip maybe-initialize-key
[ [ to>> ] [ obj>> ] [ from>> ] tri ] dip
- 2dup at* [ 2nip insert-at ]
- [ drop [ H{ } clone [ insert-at ] keep ] 2dip set-at ] if ;
+ 2dup at* [ 2nip push-at ]
+ [ drop [ H{ } clone [ push-at ] keep ] 2dip set-at ] if ;
: add-transition ( transition transition-table -- )
transitions>> set-transition ;
! Copyright (C) 2008 Doug Coleman.
! See http://factorcode.org/license.txt for BSD license.
USING: accessors assocs combinators kernel math
-quotations sequences regexp.parser regexp.classes fry arrays
-combinators.short-circuit regexp.utils prettyprint regexp.nfa ;
+quotations sequences regexp.classes fry arrays
+combinators.short-circuit prettyprint regexp.nfa ;
IN: regexp.traversal
TUPLE: dfa-traverser
dfa-table
- traversal-flags
- traverse-forward
- lookahead-counters
- lookbehind-counters
- capture-counters
- captured-groups
- capture-group-index
- last-state current-state
+ current-state
text
match-failed?
start-index current-index
matches ;
-: <dfa-traverser> ( text regexp -- match )
- [ dfa-table>> ] [ dfa-traversal-flags>> ] bi
+: <dfa-traverser> ( text dfa -- match )
dfa-traverser new
- swap >>traversal-flags
swap [ start-state>> >>current-state ] [ >>dfa-table ] bi
swap >>text
- t >>traverse-forward
0 >>start-index
0 >>current-index
- 0 >>capture-group-index
- V{ } clone >>matches
- V{ } clone >>capture-counters
- V{ } clone >>lookbehind-counters
- V{ } clone >>lookahead-counters
- H{ } clone >>captured-groups ;
+ V{ } clone >>matches ;
: final-state? ( dfa-traverser -- ? )
[ current-state>> ]
: text-finished? ( dfa-traverser -- ? )
{
- [ current-state>> empty? ]
+ [ current-state>> not ]
[ end-of-text? ]
[ match-failed?>> ]
} 1|| ;
dup save-final-state
] when text-finished? ;
+: text-character ( dfa-traverser n -- ch )
+ [ text>> ] swap '[ current-index>> _ + ] bi nth ;
+
: previous-text-character ( dfa-traverser -- ch )
- [ text>> ] [ current-index>> 1- ] bi nth ;
+ -1 text-character ;
: current-text-character ( dfa-traverser -- ch )
- [ text>> ] [ current-index>> ] bi nth ;
+ 0 text-character ;
: next-text-character ( dfa-traverser -- ch )
- [ text>> ] [ current-index>> 1+ ] bi nth ;
-
-GENERIC: flag-action ( dfa-traverser flag -- )
-
-
-M: beginning-of-input flag-action ( dfa-traverser flag -- )
- drop
- dup beginning-of-text? [ t >>match-failed? ] unless drop ;
-
-M: end-of-input flag-action ( dfa-traverser flag -- )
- drop
- dup end-of-text? [ t >>match-failed? ] unless drop ;
-
-
-M: beginning-of-line flag-action ( dfa-traverser flag -- )
- drop
- dup {
- [ beginning-of-text? ]
- [ previous-text-character terminator-class class-member? ]
- } 1|| [ t >>match-failed? ] unless drop ;
-
-M: end-of-line flag-action ( dfa-traverser flag -- )
- drop
- dup {
- [ end-of-text? ]
- [ next-text-character terminator-class class-member? ]
- } 1|| [ t >>match-failed? ] unless drop ;
-
-
-M: word-boundary flag-action ( dfa-traverser flag -- )
- drop
- dup {
- [ end-of-text? ]
- [ current-text-character terminator-class class-member? ]
- } 1|| [ t >>match-failed? ] unless drop ;
-
-
-M: lookahead-on flag-action ( dfa-traverser flag -- )
- drop
- lookahead-counters>> 0 swap push ;
-
-M: lookahead-off flag-action ( dfa-traverser flag -- )
- drop
- dup lookahead-counters>>
- [ drop ] [ pop '[ _ - ] change-current-index drop ] if-empty ;
-
-M: lookbehind-on flag-action ( dfa-traverser flag -- )
- drop
- f >>traverse-forward
- [ 2 - ] change-current-index
- lookbehind-counters>> 0 swap push ;
-
-M: lookbehind-off flag-action ( dfa-traverser flag -- )
- drop
- t >>traverse-forward
- dup lookbehind-counters>>
- [ drop ] [ pop '[ _ + 2 + ] change-current-index drop ] if-empty ;
-
-M: capture-group-on flag-action ( dfa-traverser flag -- )
- drop
- [ current-index>> 0 2array ]
- [ capture-counters>> ] bi push ;
-
-M: capture-group-off flag-action ( dfa-traverser flag -- )
- drop
- dup capture-counters>> empty? [
- drop
- ] [
- {
- [ capture-counters>> pop first2 dupd + ]
- [ text>> <slice> ]
- [ [ 1+ ] change-capture-group-index capture-group-index>> ]
- [ captured-groups>> set-at ]
- } cleave
- ] if ;
-
-: process-flags ( dfa-traverser -- )
- [ [ 1+ ] map ] change-lookahead-counters
- [ [ 1+ ] map ] change-lookbehind-counters
- [ [ first2 1+ 2array ] map ] change-capture-counters
- ! dup current-state>> .
- dup [ current-state>> ] [ traversal-flags>> ] bi
- at [ flag-action ] with each ;
+ 1 text-character ;
: increment-state ( dfa-traverser state -- dfa-traverser )
- [
- dup traverse-forward>>
- [ [ 1+ ] change-current-index ]
- [ [ 1- ] change-current-index ] if
- dup current-state>> >>last-state
- ] [ first ] bi* >>current-state ;
+ [ [ 1 + ] change-current-index ] dip >>current-state ;
: match-literal ( transition from-state table -- to-state/f )
transitions>> at at ;
: match-class ( transition from-state table -- to-state/f )
transitions>> at* [
- [ drop class-member? ] assoc-with assoc-find [ nip ] [ drop ] if
+ swap '[ drop _ swap class-member? ] assoc-find spin ?
] [ drop ] if ;
: match-default ( transition from-state table -- to-state/f )
[ dfa-table>> ] tri ;
: do-match ( dfa-traverser -- dfa-traverser )
- dup process-flags
dup match-done? [
dup setup-match match-transition
[ increment-state do-match ] when*
+++ /dev/null
-USING: regexp.utils tools.test ;
-IN: regexp.utils.tests
-
-[ [ ] [ ] while-changes ] must-infer
+++ /dev/null
-! Copyright (C) 2008 Doug Coleman.
-! See http://factorcode.org/license.txt for BSD license.
-USING: accessors arrays assocs io kernel math math.order
-namespaces regexp.backend sequences unicode.categories
-math.ranges fry combinators.short-circuit vectors ;
-IN: regexp.utils
-
-: (while-changes) ( obj quot: ( obj -- obj' ) pred: ( obj -- <=> ) pred-ret -- obj )
- [ [ dup slip ] dip pick over call ] dip dupd =
- [ 3drop ] [ (while-changes) ] if ; inline recursive
-
-: while-changes ( obj quot pred -- obj' )
- pick over call (while-changes) ; inline
-
-: assoc-with ( param assoc quot -- assoc curry )
- swapd [ [ -rot ] dip call ] 2curry ; inline
-
-: insert-at ( value key hash -- )
- 2dup at* [
- 2nip push
- ] [
- drop
- [ dup vector? [ 1vector ] unless ] 2dip set-at
- ] if ;
-
-: ?insert-at ( value key hash/f -- hash )
- [ H{ } clone ] unless* [ insert-at ] keep ;
-
-ERROR: bad-octal number ;
-ERROR: bad-hex number ;
-: check-octal ( octal -- octal ) dup 255 > [ bad-octal ] when ;
-: check-hex ( hex -- hex ) dup number? [ bad-hex ] unless ;
-
-: ascii? ( n -- ? ) 0 HEX: 7f between? ;
-: octal-digit? ( n -- ? ) CHAR: 0 CHAR: 7 between? ;
-: decimal-digit? ( n -- ? ) CHAR: 0 CHAR: 9 between? ;
-
-: hex-digit? ( n -- ? )
- [
- [ decimal-digit? ]
- [ CHAR: a CHAR: f between? ]
- [ CHAR: A CHAR: F between? ]
- ] 1|| ;
-
-: control-char? ( n -- ? )
- [
- [ 0 HEX: 1f between? ]
- [ HEX: 7f = ]
- ] 1|| ;
-
-: punct? ( n -- ? )
- "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~" member? ;
-
-: c-identifier-char? ( ch -- ? )
- [ [ alpha? ] [ CHAR: _ = ] ] 1|| ;
-
-: java-blank? ( n -- ? )
- {
- CHAR: \s CHAR: \t CHAR: \n
- HEX: b HEX: 7 CHAR: \r
- } member? ;
-
-: java-printable? ( n -- ? )
- [ [ alpha? ] [ punct? ] ] 1|| ;
USING: xmode.loader xmode.utilities xmode.rules namespaces
strings splitting assocs sequences kernel io.files xml memoize
-words globs combinators io.encodings.utf8 sorting accessors xml.data ;
+words globs combinators io.encodings.utf8 sorting accessors xml.data
+xml.traversal xml.syntax ;
IN: xmode.catalog
TUPLE: mode file file-name-glob first-line-glob ;
-<TAGS: parse-mode-tag ( modes tag -- )
+TAGS: parse-mode-tag ( modes tag -- )
-TAG: MODE
+TAG: MODE parse-mode-tag
dup "NAME" attr [
mode new {
{ "FILE" f (>>file) }
] dip
rot set-at ;
-TAGS>
-
: parse-modes-tag ( tag -- modes )
H{ } clone [
- swap child-tags [ parse-mode-tag ] with each
+ swap children-tags [ parse-mode-tag ] with each
] keep ;
MEMO: modes ( -- modes )
] if ;
: finalize-mode ( rulesets -- )
- rule-sets [
- dup [ nip finalize-rule-set ] assoc-each
+ dup rule-sets [
+ [ nip finalize-rule-set ] assoc-each
] with-variable ;
: load-mode ( name -- rule-sets )
USING: xmode.loader.syntax xmode.tokens xmode.rules
xmode.keyword-map xml.data xml.traversal xml assocs kernel
combinators sequences math.parser namespaces parser
-xmode.utilities parser-combinators.regexp io.files accessors ;
+xmode.utilities regexp io.files accessors xml.syntax ;
IN: xmode.loader
! Based on org.gjt.sp.jedit.XModeHandler
! RULES and its children
-<TAGS: parse-rule-tag ( rule-set tag -- )
+TAGS: parse-rule-tag ( rule-set tag -- )
-TAG: PROPS
+TAG: PROPS parse-rule-tag
parse-props-tag >>props drop ;
-TAG: IMPORT
+TAG: IMPORT parse-rule-tag
"DELEGATE" attr swap import-rule-set ;
-TAG: TERMINATE
+TAG: TERMINATE parse-rule-tag
"AT_CHAR" attr string>number >>terminate-char drop ;
-RULE: SEQ seq-rule
+RULE: SEQ seq-rule parse-rule-tag
shared-tag-attrs delegate-attr literal-start ;
-RULE: SEQ_REGEXP seq-rule
+RULE: SEQ_REGEXP seq-rule parse-rule-tag
shared-tag-attrs delegate-attr regexp-attr regexp-start ;
-RULE: SPAN span-rule
+RULE: SPAN span-rule parse-rule-tag
shared-tag-attrs delegate-attr match-type-attr span-attrs parse-begin/end-tags init-span-tag ;
-RULE: SPAN_REGEXP span-rule
+RULE: SPAN_REGEXP span-rule parse-rule-tag
shared-tag-attrs delegate-attr match-type-attr span-attrs regexp-attr parse-begin/end-tags init-span-tag ;
-RULE: EOL_SPAN eol-span-rule
+RULE: EOL_SPAN eol-span-rule parse-rule-tag
shared-tag-attrs delegate-attr match-type-attr literal-start init-eol-span-tag ;
-RULE: EOL_SPAN_REGEXP eol-span-rule
+RULE: EOL_SPAN_REGEXP eol-span-rule parse-rule-tag
shared-tag-attrs delegate-attr match-type-attr regexp-attr regexp-start init-eol-span-tag ;
-RULE: MARK_FOLLOWING mark-following-rule
+RULE: MARK_FOLLOWING mark-following-rule parse-rule-tag
shared-tag-attrs match-type-attr literal-start ;
-RULE: MARK_PREVIOUS mark-previous-rule
+RULE: MARK_PREVIOUS mark-previous-rule parse-rule-tag
shared-tag-attrs match-type-attr literal-start ;
-TAG: KEYWORDS ( rule-set tag -- key value )
+TAG: KEYWORDS parse-rule-tag
rule-set get ignore-case?>> <keyword-map>
- swap child-tags [ over parse-keyword-tag ] each
+ swap children-tags [ over parse-keyword-tag ] each
swap (>>keywords) ;
-TAGS>
-
: ?<regexp> ( string/f -- regexp/f )
- dup [ rule-set get ignore-case?>> <regexp> ] when ;
+ dup [ rule-set get ignore-case?>> <?insensitive-regexp> ] when ;
: (parse-rules-tag) ( tag -- rule-set )
<rule-set> dup rule-set set
: parse-rules-tag ( tag -- rule-set )
[
- [ (parse-rules-tag) ] [ child-tags ] bi
+ [ (parse-rules-tag) ] [ children-tags ] bi
[ parse-rule-tag ] with each
rule-set get
] with-scope ;
USING: accessors xmode.tokens xmode.rules xmode.keyword-map
xml.data xml.traversal xml assocs kernel combinators sequences
math.parser namespaces make parser lexer xmode.utilities
-parser-combinators.regexp io.files splitting arrays ;
+regexp io.files splitting arrays xml.syntax xml.syntax.private ;
IN: xmode.loader.syntax
! Rule tag parsing utilities
new swap init-from-tag swap add-rule ; inline
: RULE:
- scan scan-word
- parse-definition { } make
- swap [ (parse-rule-tag) ] 2curry (TAG:) ; parsing
+ scan scan-word scan-word [
+ parse-definition { } make
+ swap [ (parse-rule-tag) ] 2curry
+ ] dip swap define-tag ; parsing
! Attribute utilities
: string>boolean ( string -- ? ) "TRUE" = ;
[ "NAME" attr ] [ "VALUE" attr ] bi ;
: parse-props-tag ( tag -- assoc )
- child-tags
+ children-tags
[ parse-prop-tag ] H{ } map>assoc ;
: position-attrs ( tag -- at-line-start? at-whitespace-end? at-word-start? )
swap position-attrs <matcher> ;
: parse-regexp-matcher ( tag -- matcher )
- dup children>string rule-set get ignore-case?>> <regexp>
+ dup children>string
+ rule-set get ignore-case?>> <?insensitive-regexp>
swap position-attrs <matcher> ;
: shared-tag-attrs ( -- )
[ parse-literal-matcher >>end drop ] , ;
! SPAN's children
-<TAGS: parse-begin/end-tag ( rule tag -- )
+TAGS: parse-begin/end-tag ( rule tag -- )
-TAG: BEGIN
+TAG: BEGIN parse-begin/end-tag
! XXX
parse-literal-matcher >>start drop ;
-TAG: END
+TAG: END parse-begin/end-tag
! XXX
parse-literal-matcher >>end drop ;
-TAGS>
-
: parse-begin/end-tags ( -- )
[
! XXX: handle position attrs on span tag itself
- child-tags [ parse-begin/end-tag ] with each
+ children-tags [ parse-begin/end-tag ] with each
] , ;
: init-span-tag ( -- ) [ drop init-span ] , ;
USING: kernel namespaces make xmode.rules xmode.tokens
xmode.marker.state xmode.marker.context xmode.utilities
xmode.catalog sequences math assocs combinators strings
-parser-combinators.regexp splitting parser-combinators ascii
+regexp splitting ascii regexp.backend unicode.case
ascii combinators.short-circuit accessors ;
+! regexp.backend is for the regexp class
+
+! Next two words copied from parser-combinators
+! Just like head?, but they optionally ignore case
+
+: string= ( str1 str2 ignore-case -- ? )
+ [ [ >upper ] bi@ ] when sequence= ;
+
+: string-head? ( str1 str2 ignore-case -- ? )
+ 2over shorter?
+ [ 3drop f ] [
+ [
+ [ nip ]
+ [ length head-slice ] 2bi
+ ] dip string=
+ ] if ;
! Based on org.gjt.sp.jedit.syntax.TokenMarker
process-escape? get [
escaped? [ not ] change
position [ + ] change
- ] [ 2drop ] if ;
+ ] [ drop ] if ;
M: seq-rule handle-rule-start
?end-rule
USING: accessors xmode.tokens xmode.keyword-map kernel
sequences vectors assocs strings memoize unicode.case
-parser-combinators.regexp ;
+regexp regexp.backend ; ! regexp.backend has the regexp class
IN: xmode.rules
TUPLE: string-matcher string ignore-case? ;
+USING: assocs xmode.utilities tools.test ;
IN: xmode.utilities.tests
-USING: accessors xmode.utilities tools.test xml xml.data kernel
-strings vectors sequences io.files prettyprint assocs
-unicode.case ;
+
[ "hi" 3 ] [
{ 1 2 3 4 5 6 7 8 } [ H{ { 3 "hi" } } at ] map-find
] unit-test
[ f f ] [
{ 1 2 3 4 5 6 7 8 } [ H{ { 11 "hi" } } at ] map-find
] unit-test
-
-TUPLE: company employees type ;
-
-: <company> V{ } clone f company boa ;
-
-: add-employee employees>> push ;
-
-<TAGS: parse-employee-tag
-
-TUPLE: employee name description ;
-
-TAG: employee
- employee new
- { { "name" f (>>name) } { f (>>description) } }
- init-from-tag swap add-employee ;
-
-TAGS>
-
-\ parse-employee-tag see
-
-: parse-company-tag
- [
- <company>
- { { "type" >upper (>>type) } }
- init-from-tag dup
- ] keep
- children>> [ tag? ] filter
- [ parse-employee-tag ] with each ;
-
-[
- T{ company f
- V{
- T{ employee f "Joe" "VP Sales" }
- T{ employee f "Jane" "CFO" }
- }
- "PUBLIC"
- }
-] [
- "vocab:xmode/utilities/test.xml"
- file>xml parse-company-tag
-] unit-test
USING: accessors sequences assocs kernel quotations namespaces
-xml.data xml.traversal combinators macros parser lexer words fry ;
+xml.data xml.traversal combinators macros parser lexer words fry
+regexp ;
IN: xmode.utilities
: implies ( x y -- z ) [ not ] dip or ; inline
-: child-tags ( tag -- seq ) children>> [ tag? ] filter ;
-
: map-find ( seq quot -- result elt )
[ f ] 2dip
'[ nip @ dup ] find
: init-from-tag ( tag tuple specs -- tuple )
over [ (init-from-tag) ] dip ; inline
-SYMBOL: tag-handlers
-SYMBOL: tag-handler-word
-
-: <TAGS:
- CREATE tag-handler-word set
- H{ } clone tag-handlers set ; parsing
-
-: (TAG:) ( name quot -- ) swap tag-handlers get set-at ;
-
-: TAG:
- scan parse-definition
- (TAG:) ; parsing
-
-: TAGS>
- tag-handler-word get
- tag-handlers get >alist [ [ dup main>> ] dip case ] curry
- define ; parsing
+: <?insensitive-regexp> ( string ? -- regexp )
+ "i" "" ? <optioned-regexp> ;
: substituter ( assoc -- quot )
[ ?at drop ] curry ; inline
-: with-assoc ( assoc quot: ( value key -- assoc ) -- quot: ( key value -- ) )
+: with-assoc ( assoc quot: ( value key assoc -- ) -- quot: ( key value -- ) )
curry [ swap ] prepose ; inline
PRIVATE>