TUPLE: concatenation first second ;
: <concatenation> ( seq -- concatenation )
- epsilon [ concatenation boa ] reduce ;
+ [ epsilon ] [ unclip [ concatenation boa ] reduce ] if-empty ;
TUPLE: alternation first second ;
: char-class ( ranges ? -- term )
[ <or-class> ] dip [ <not-class> ] when ;
+
+TUPLE: lookahead term ;
+C: <lookahead> lookahead
+
+TUPLE: lookbehind term ;
+C: <lookbehind> lookbehind
ascii unicode.categories combinators.short-circuit sequences ;
IN: regexp.classes
-: punct? ( ch -- ? )
- "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~" member? ;
-
-: c-identifier-char? ( ch -- ? )
- { [ alpha? ] [ CHAR: _ = ] } 1|| ;
-
-: java-blank? ( ch -- ? )
- {
- CHAR: \s CHAR: \t CHAR: \n
- HEX: b HEX: 7 CHAR: \r
- } member? ;
-
-: java-printable? ( ch -- ? )
- [ [ alpha? ] [ punct? ] ] 1|| ;
-
-: hex-digit? ( ch -- ? )
- {
- [ CHAR: A CHAR: F between? ]
- [ CHAR: a CHAR: f between? ]
- [ CHAR: 0 CHAR: 9 between? ]
- } 1|| ;
-
SINGLETONS: any-char any-char-no-nl
letter-class LETTER-class Letter-class digit-class
alpha-class non-newline-blank-class
M: digit-class class-member? ( obj class -- ? )
drop digit? ;
+: c-identifier-char? ( ch -- ? )
+ { [ alpha? ] [ CHAR: _ = ] } 1|| ;
+
M: c-identifier-class class-member? ( obj class -- ? )
- drop
- { [ digit? ] [ Letter? ] [ CHAR: _ = ] } 1|| ;
+ drop c-identifier-char? ;
M: alpha-class class-member? ( obj class -- ? )
drop alpha? ;
+: punct? ( ch -- ? )
+ "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~" member? ;
+
M: punctuation-class class-member? ( obj class -- ? )
drop punct? ;
+: java-printable? ( ch -- ? )
+ { [ alpha? ] [ punct? ] } 1|| ;
+
M: java-printable-class class-member? ( obj class -- ? )
drop java-printable? ;
M: control-character-class class-member? ( obj class -- ? )
drop control? ;
+: hex-digit? ( ch -- ? )
+ {
+ [ CHAR: A CHAR: F between? ]
+ [ CHAR: a CHAR: f between? ]
+ [ CHAR: 0 CHAR: 9 between? ]
+ } 1|| ;
+
M: hex-digit-class class-member? ( obj class -- ? )
drop hex-digit? ;
+: java-blank? ( ch -- ? )
+ {
+ CHAR: \s CHAR: \t CHAR: \n
+ HEX: b HEX: 7 CHAR: \r
+ } member? ;
+
M: java-blank-class class-member? ( obj class -- ? )
drop java-blank? ;
2drop f ;
M: terminator-class class-member? ( obj class -- ? )
- drop {
- [ CHAR: \r = ]
- [ CHAR: \n = ]
- [ CHAR: \u000085 = ]
- [ CHAR: \u002028 = ]
- [ CHAR: \u002029 = ]
- } 1|| ;
+ drop "\r\n\u000085\u002029\u002028" member? ;
M: beginning-of-line class-member? ( obj class -- ? )
2drop f ;
TUPLE: not-class class ;
C: <not-class> not-class
+: <and-class> ( classes -- class )
+ [ <not-class> ] map <or-class> <not-class> ;
+
TUPLE: primitive-class class ;
C: <primitive-class> primitive-class
M: primitive-class class-member?
class>> class-member? ;
+
+UNION: class primitive-class not-class or-class range ;
! See http://factorcode.org/license.txt for BSD license.
USING: accessors arrays assocs combinators fry kernel locals
math math.order regexp.nfa regexp.transition-tables sequences
-sets sorting vectors sequences.deep ;
+sets sorting vectors sequences.deep math.functions regexp.classes ;
USING: io prettyprint threads ;
IN: regexp.dfa
: while-changes ( obj quot pred -- obj' )
3dup nip call (while-changes) ; inline
+TUPLE: parts in out ;
+
+: make-partition ( choices classes -- partition )
+ zip [ first ] partition parts boa ;
+
+: powerset-partition ( classes -- partitions )
+ ! Here is where class algebra will happen, when I implement it
+ [ length [ 2^ ] keep ] keep '[
+ _ [ ] map-bits _ make-partition
+ ] map ;
+
+: partition>class ( parts -- class )
+ [ in>> ] [ out>> ] bi
+ [ <or-class> ] bi@ <not-class> 2array <and-class> ;
+
+: get-transitions ( partition state-transitions -- next-states )
+ [ in>> ] dip '[ at ] gather ;
+
+: disambiguate-overlap ( nfa -- nfa' )
+ [
+ [
+ [ keys powerset-partition ] keep '[
+ [ partition>class ]
+ [ _ get-transitions ] bi
+ ] H{ } map>assoc
+ ] assoc-map
+ ] change-transitions ;
+
: find-delta ( states transition nfa -- new-states )
transitions>> '[ _ swap _ at at ] gather sift ;
swap find-start-state >>start-state ;
: construct-dfa ( nfa -- dfa )
+ disambiguate-overlap
dup initialize-dfa
dup start-state>> 1vector
H{ } clone
USING: accessors arrays assocs grouping kernel
locals math namespaces sequences fry quotations
math.order math.ranges vectors unicode.categories
-regexp.transition-tables words sets hashtables
+regexp.transition-tables words sets hashtables combinators.short-circuit
unicode.case.private regexp.ast regexp.classes ;
+IN: regexp.nfa
+
! This uses unicode.case.private for ch>upper and ch>lower
! but case-insensitive matching should be done by case-folding everything
! before processing starts
-IN: regexp.nfa
-SYMBOL: negated?
+GENERIC: remove-lookahead ( syntax-tree -- syntax-tree' )
+! This is unfinished and does nothing right now!
+
+M: object remove-lookahead ;
+
+M: with-options remove-lookahead
+ [ tree>> remove-lookahead ] [ options>> ] bi <with-options> ;
-: negate ( -- )
- negated? [ not ] change ;
+M: alternation remove-lookahead
+ [ first>> ] [ second>> ] bi [ remove-lookahead ] bi@ ;
+
+M: concatenation remove-lookahead ;
SINGLETON: eps
GENERIC: nfa-node ( node -- start-state end-state )
-:: add-simple-entry ( obj class -- start-state end-state )
- next-state :> s0
- next-state :> s1
- negated? get [
- s0 f obj class make-transition table add-transition
- s0 s1 <default-transition> table add-transition
- ] [
- s0 s1 obj class make-transition table add-transition
- ] if
- s0 s1 ;
+: add-simple-entry ( obj class -- start-state end-state )
+ [ next-state next-state 2dup ] 2dip
+ make-transition table add-transition ;
: epsilon-transition ( source target -- )
eps <literal-transition> table add-transition ;
[ nfa-node ] bi@
alternate-nodes ;
-M: integer nfa-node ( node -- start end )
+GENERIC: modify-class ( char-class -- char-class' )
+
+M: object modify-class ;
+
+M: integer modify-class
case-insensitive option? [
- dup [ ch>lower ] [ ch>upper ] bi
- 2dup = [
- 2drop
- literal-transition add-simple-entry
- ] [
- [ literal-transition add-simple-entry ] bi@
- alternate-nodes [ nip ] dip
- ] if
- ] [ literal-transition add-simple-entry ] if ;
-
-M: primitive-class nfa-node ( node -- start end )
- class>> dup
- { letter-class LETTER-class } member? case-insensitive option? and
- [ drop Letter-class ] when
- class-transition add-simple-entry ;
-
-M: or-class nfa-node class-transition add-simple-entry ;
-M: not-class nfa-node class-transition add-simple-entry ;
-
-M: any-char nfa-node ( node -- start end )
- [ dotall option? ] dip any-char-no-nl ?
- class-transition add-simple-entry ;
-
-! M: negation nfa-node ( node -- start end )
-! negate term>> nfa-node negate ;
-
-M: range nfa-node ( node -- start end )
+ dup Letter? [
+ [ ch>lower ] [ ch>upper ] bi 2array <or-class>
+ ] when
+ ] when ;
+
+M: integer nfa-node ( node -- start end )
+ modify-class dup class?
+ class-transition literal-transition ?
+ add-simple-entry ;
+
+M: primitive-class modify-class
+ class>> modify-class <primitive-class> ;
+
+M: or-class modify-class
+ seq>> [ modify-class ] map <or-class> ;
+
+M: not-class modify-class
+ class>> modify-class <not-class> ;
+
+M: any-char modify-class
+ [ dotall option? ] dip any-char-no-nl ? ;
+
+: modify-letter-class ( class -- newclass )
+ case-insensitive option? [ drop Letter-class ] when ;
+M: letter-class modify-class modify-letter-class ;
+M: LETTER-class modify-class modify-letter-class ;
+
+: cased-range? ( range -- ? )
+ [ from>> ] [ to>> ] bi {
+ [ [ letter? ] bi@ and ]
+ [ [ LETTER? ] bi@ and ]
+ } 2|| ;
+
+M: range modify-class
case-insensitive option? [
- ! This should be implemented for Unicode by case-folding
- ! the input and all strings in the regexp.
- dup [ from>> ] [ to>> ] bi
- 2dup [ Letter? ] bi@ and [
- rot drop
+ dup cased-range? [
+ [ from>> ] [ to>> ] bi
[ [ ch>lower ] bi@ <range> ]
[ [ ch>upper ] bi@ <range> ] 2bi
- [ class-transition add-simple-entry ] bi@
- alternate-nodes
- ] [
- 2drop
- class-transition add-simple-entry
- ] if
- ] [
- class-transition add-simple-entry
- ] if ;
+ 2array <or-class>
+ ] when
+ ] when ;
+
+M: class nfa-node
+ modify-class class-transition add-simple-entry ;
M: with-options nfa-node ( node -- start end )
dup options>> [ tree>> nfa-node ] using-options ;
: construct-nfa ( ast -- nfa-table )
[
- negated? off
0 state set
- <transition-table> clone nfa-table set
- nfa-node
+ <transition-table> nfa-table set
+ remove-lookahead nfa-node
table
swap dup associate >>final-states
swap >>start-state
=> [[ a on off parse-options <with-options> ]]
| "?#" [^)]* => [[ f ]]
| "?~" Alternation:a => [[ a <negation> ]]
+ | "?=" Alternation:a => [[ a <lookahead> ]]
+ | "?!" Alternation:a => [[ a <negation> <lookahead> ]]
+ | "?<=" Alternation:a => [[ a <lookbehind> ]]
+ | "?<!" Alternation:a => [[ a <negation> <lookbehind> ]]
| Alternation
Element = "(" Parenthized:p ")" => [[ p ]]
| "[" CharClass:r "]" => [[ r ]]
- | ".":d => [[ any-char ]]
+ | ".":d => [[ any-char <primitive-class> ]]
| Character
Number = (!(","|"}").)* => [[ string>number ensure-number ]]
! Bug in parsing word
[ t ] [ "a" R' a' matches? ] unit-test
+! Testing negation
+[ f ] [ "a" R/ (?~a)/ matches? ] unit-test
+[ t ] [ "aa" R/ (?~a)/ matches? ] unit-test
+[ t ] [ "bb" R/ (?~a)/ matches? ] unit-test
+[ t ] [ "" R/ (?~a)/ matches? ] unit-test
+
+[ f ] [ "a" R/ (?~a+|b)/ matches? ] unit-test
+[ f ] [ "aa" R/ (?~a+|b)/ matches? ] unit-test
+[ t ] [ "bb" R/ (?~a+|b)/ matches? ] unit-test
+[ f ] [ "b" R/ (?~a+|b)/ matches? ] unit-test
+[ t ] [ "" R/ (?~a+|b)/ matches? ] unit-test
+
+! Intersecting classes
+[ t ] [ "ab" R/ ac|\p{Lower}b/ matches? ] unit-test
+[ t ] [ "ab" R/ ac|[a-z]b/ matches? ] unit-test
+
! [ t ] [ "a" R/ ^a/ matches? ] unit-test
! [ f ] [ "\na" R/ ^a/ matches? ] unit-test
! [ f ] [ "\r\na" R/ ^a/ matches? ] unit-test