+++ /dev/null
-USING: pcre.ffi sequences splitting tools.test ;
-IN: pcre.ffi.tests
-
-[ 2 ] [ pcre_version " " split length ] unit-test
+++ /dev/null
-USING:
- alien alien.c-types alien.data alien.libraries alien.syntax
- classes.struct
- combinators
- system ;
-IN: pcre.ffi
-
-! http://sourceforge.net/projects/gnuwin32/files/pcre/7.0/pcre-7.0-bin.zip/download
-
-<< "pcre" {
- { [ os unix? ] [ "libpcre.so" ] }
- { [ os windows? ] [ "pcre3.dll" ] }
-} cond cdecl add-library >>
-
-LIBRARY: pcre
-
-CONSTANT: PCRE_CASELESS 0x00000001
-CONSTANT: PCRE_MULTILINE 0x00000002
-CONSTANT: PCRE_DOTALL 0x00000004
-CONSTANT: PCRE_EXTENDED 0x00000008
-CONSTANT: PCRE_ANCHORED 0x00000010
-CONSTANT: PCRE_DOLLAR_ENDONLY 0x00000020
-CONSTANT: PCRE_EXTRA 0x00000040
-CONSTANT: PCRE_NOTBOL 0x00000080
-CONSTANT: PCRE_NOTEOL 0x00000100
-CONSTANT: PCRE_UNGREEDY 0x00000200
-CONSTANT: PCRE_NOTEMPTY 0x00000400
-CONSTANT: PCRE_UTF8 0x00000800
-CONSTANT: PCRE_NO_AUTO_CAPTURE 0x00001000
-CONSTANT: PCRE_NO_UTF8_CHECK 0x00002000
-CONSTANT: PCRE_AUTO_CALLOUT 0x00004000
-CONSTANT: PCRE_PARTIAL_SOFT 0x00008000
-CONSTANT: PCRE_PARTIAL 0x00008000
-CONSTANT: PCRE_DFA_SHORTEST 0x00010000
-CONSTANT: PCRE_DFA_RESTART 0x00020000
-CONSTANT: PCRE_FIRSTLINE 0x00040000
-CONSTANT: PCRE_DUPNAMES 0x00080000
-CONSTANT: PCRE_NEWLINE_CR 0x00100000
-CONSTANT: PCRE_NEWLINE_LF 0x00200000
-CONSTANT: PCRE_NEWLINE_CRLF 0x00300000
-CONSTANT: PCRE_NEWLINE_ANY 0x00400000
-CONSTANT: PCRE_NEWLINE_ANYCRLF 0x00500000
-CONSTANT: PCRE_BSR_ANYCRLF 0x00800000
-CONSTANT: PCRE_BSR_UNICODE 0x01000000
-CONSTANT: PCRE_JAVASCRIPT_COMPAT 0x02000000
-CONSTANT: PCRE_NO_START_OPTIMIZE 0x04000000
-CONSTANT: PCRE_NO_START_OPTIMISE 0x04000000
-CONSTANT: PCRE_PARTIAL_HARD 0x08000000
-CONSTANT: PCRE_NOTEMPTY_ATSTART 0x10000000
-CONSTANT: PCRE_UCP 0x20000000
-
-ENUM: PCRE_ERRORS
- { PCRE_ERROR_NOMATCH -1 }
- { PCRE_ERROR_NULL -2 }
- { PCRE_ERROR_BADOPTION -3 }
- { PCRE_ERROR_BADMAGIC -4 }
- { PCRE_ERROR_UNKNOWN_OPCODE -5 }
- { PCRE_ERROR_UNKNOWN_NODE -5 }
- { PCRE_ERROR_NOMEMORY -6 }
- { PCRE_ERROR_NOSUBSTRING -7 }
- { PCRE_ERROR_MATCHLIMIT -8 }
- { PCRE_ERROR_CALLOUT -9 }
- { PCRE_ERROR_BADUTF8 -10 }
- { PCRE_ERROR_BADUTF8_OFFSET -11 }
- { PCRE_ERROR_PARTIAL -12 }
- { PCRE_ERROR_BADPARTIAL -13 }
- { PCRE_ERROR_INTERNAL -14 }
- { PCRE_ERROR_BADCOUNT -15 }
- { PCRE_ERROR_DFA_UITEM -16 }
- { PCRE_ERROR_DFA_UCOND -17 }
- { PCRE_ERROR_DFA_UMLIMIT -18 }
- { PCRE_ERROR_DFA_WSSIZE -19 }
- { PCRE_ERROR_DFA_RECURSE -20 }
- { PCRE_ERROR_RECURSIONLIMIT -21 }
- { PCRE_ERROR_NULLWSLIMIT -22 }
- { PCRE_ERROR_BADNEWLINE -23 }
- { PCRE_ERROR_BADOFFSET -24 }
- { PCRE_ERROR_SHORTUTF8 -25 } ;
-
-CONSTANT: PCRE_ERROR_NOMATCH -1
-CONSTANT: PCRE_ERROR_NULL -2
-CONSTANT: PCRE_ERROR_BADOPTION -3
-CONSTANT: PCRE_ERROR_BADMAGIC -4
-CONSTANT: PCRE_ERROR_UNKNOWN_OPCODE -5
-CONSTANT: PCRE_ERROR_UNKNOWN_NODE -5
-CONSTANT: PCRE_ERROR_NOMEMORY -6
-CONSTANT: PCRE_ERROR_NOSUBSTRING -7
-CONSTANT: PCRE_ERROR_MATCHLIMIT -8
-CONSTANT: PCRE_ERROR_CALLOUT -9
-CONSTANT: PCRE_ERROR_BADUTF8 -10
-CONSTANT: PCRE_ERROR_BADUTF8_OFFSET -11
-CONSTANT: PCRE_ERROR_PARTIAL -12
-CONSTANT: PCRE_ERROR_BADPARTIAL -13
-CONSTANT: PCRE_ERROR_INTERNAL -14
-CONSTANT: PCRE_ERROR_BADCOUNT -15
-CONSTANT: PCRE_ERROR_DFA_UITEM -16
-CONSTANT: PCRE_ERROR_DFA_UCOND -17
-CONSTANT: PCRE_ERROR_DFA_UMLIMIT -18
-CONSTANT: PCRE_ERROR_DFA_WSSIZE -19
-CONSTANT: PCRE_ERROR_DFA_RECURSE -20
-CONSTANT: PCRE_ERROR_RECURSIONLIMIT -21
-CONSTANT: PCRE_ERROR_NULLWSLIMIT -22
-CONSTANT: PCRE_ERROR_BADNEWLINE -23
-CONSTANT: PCRE_ERROR_BADOFFSET -24
-CONSTANT: PCRE_ERROR_SHORTUTF8 -25
-
-CONSTANT: PCRE_INFO_OPTIONS 0
-CONSTANT: PCRE_INFO_SIZE 1
-CONSTANT: PCRE_INFO_CAPTURECOUNT 2
-CONSTANT: PCRE_INFO_BACKREFMAX 3
-CONSTANT: PCRE_INFO_FIRSTBYTE 4
-CONSTANT: PCRE_INFO_FIRSTCHAR 4
-CONSTANT: PCRE_INFO_FIRSTTABLE 5
-CONSTANT: PCRE_INFO_LASTLITERAL 6
-CONSTANT: PCRE_INFO_NAMEENTRYSIZE 7
-CONSTANT: PCRE_INFO_NAMECOUNT 8
-CONSTANT: PCRE_INFO_NAMETABLE 9
-CONSTANT: PCRE_INFO_STUDYSIZE 10
-CONSTANT: PCRE_INFO_DEFAULT_TABLES 11
-CONSTANT: PCRE_INFO_OKPARTIAL 12
-CONSTANT: PCRE_INFO_JCHANGED 13
-CONSTANT: PCRE_INFO_HASCRORLF 14
-CONSTANT: PCRE_INFO_MINLENGTH 15
-
-CONSTANT: PCRE_CONFIG_UTF8 0
-CONSTANT: PCRE_CONFIG_NEWLINE 1
-CONSTANT: PCRE_CONFIG_LINK_SIZE 2
-CONSTANT: PCRE_CONFIG_POSIX_MALLOC_THRESHOLD 3
-CONSTANT: PCRE_CONFIG_MATCH_LIMIT 4
-CONSTANT: PCRE_CONFIG_STACKRECURSE 5
-CONSTANT: PCRE_CONFIG_UNICODE_PROPERTIES 6
-CONSTANT: PCRE_CONFIG_MATCH_LIMIT_RECURSION 7
-CONSTANT: PCRE_CONFIG_BSR 8
-
-
-STRUCT: pcre_extra
- { flags int }
- { study_data void* }
- { match_limit long }
- { callout_data void* }
- { tables uchar* }
- { match_limit_recursion int }
- { mark uchar** } ;
-
-FUNCTION: void pcre_config ( int what, void* where ) ;
-
-FUNCTION: void* pcre_compile ( c-string pattern,
- int options,
- char** errptr,
- int* erroffset,
- char* tableptr ) ;
-
-FUNCTION: void* pcre_compile2 ( c-string pattern,
- int options,
- int* errcodeptr,
- char** errptr,
- int* erroffset,
- char* tableptr ) ;
-
-FUNCTION: int pcre_info ( void* pcre, int* optptr, int* first_byte ) ;
-FUNCTION: int pcre_fullinfo ( void* pcre, pcre_extra* extra, int what, void *where ) ;
-
-FUNCTION: pcre_extra* pcre_study ( void* pcre, int options, char** errptr ) ;
-FUNCTION: int pcre_exec ( void* pcre,
- pcre_extra* extra,
- c-string subject,
- int length,
- int startoffset,
- int options,
- int* ovector,
- int ovecsize ) ;
-
-FUNCTION: int pcre_get_stringnumber ( void* pcre, c-string name ) ;
-
-FUNCTION: int pcre_get_substring ( c-string subject,
- int* ovector,
- int stringcount,
- int stringnumber,
- void *stringptr ) ;
-
-FUNCTION: int pcre_get_substring_list ( c-string subject,
- int* ovector,
- int stringcount,
- void *stringptr ) ;
-
-FUNCTION: c-string pcre_version ( ) ;
-
-FUNCTION: uchar* pcre_maketables ( ) ;
+++ /dev/null
-USING: accessors pcre pcre.info pcre.utils sequences tools.test ;
-
-[ { { 3 "day" } { 2 "month" } { 1 "year" } } ]
-[
- "(?P<year>\\d{4})-(?P<month>\\d{2})-(?P<day>\\d{2})" <compiled-pcre>
- nametable>>
-] unit-test
-
-[ { 100 110 120 130 } ] [ 100 10 4 gen-array-addrs ] unit-test
+++ /dev/null
-USING:
- accessors
- alien alien.accessors alien.c-types alien.data alien.strings
- arrays
- io.encodings.utf8
- kernel
- math
- pcre.ffi pcre.utils
- sequences ;
-IN: pcre.info
-
-! Mostly internal
-: fullinfo ( pcre extra what -- obj )
- { int } [ pcre_fullinfo ] with-out-parameters nip ;
-
-: name-count ( pcre extra -- n )
- PCRE_INFO_NAMECOUNT fullinfo ;
-
-: name-table ( pcre extra -- addr )
- PCRE_INFO_NAMETABLE fullinfo ;
-
-: name-entry-size ( pcre extra -- size )
- PCRE_INFO_NAMEENTRYSIZE fullinfo ;
-
-: name-table-entry ( addr -- group-index group-name )
- [ <alien> 1 alien-unsigned-1 ] [ 2 + <alien> utf8 alien>string ] bi ;
-
-: options ( pcre -- opts )
- f PCRE_INFO_OPTIONS fullinfo ;
-
-! Exported
-: name-table-entries ( pcre extra -- addrs )
- [ name-table ] [ name-entry-size ] [ name-count ] 2tri gen-array-addrs
- [ name-table-entry 2array ] map ;
+++ /dev/null
-USING:
- accessors
- arrays
- assocs
- http.client
- kernel
- math math.ranges
- pcre pcre.ffi pcre.info
- random
- sequences
- system
- tools.test ;
-QUALIFIED: regexp
-IN: pcre.tests
-
-CONSTANT: iso-date "(?P<year>\\d{4})-(?P<month>\\d{2})-(?P<day>\\d{2})"
-
-! On windows the erroffset appears to be set to 0 despite there being
-! nothing wrong with the regexp.
-[ t ] [
- "foo" (pcre) 3array 1 tail { { f -1 } { f 0 } } member?
-] unit-test
-
-[ { 1 2 3 } ] [
- iso-date <pcre>
- { "year" "month" "day" } [ pcre_get_stringnumber ] with map
-] unit-test
-
-[ t ] [ "foo" <compiled-pcre> PCRE_UTF8 has-option? ] unit-test
-
-os unix? [ [ 10 ] [ PCRE_CONFIG_NEWLINE config ] unit-test ] when
-
-! In this day and age, not supporting utf-8 is broken.
-[ 1 ] [ PCRE_CONFIG_UTF8 config ] unit-test
-
-[ 1 ] [ PCRE_CONFIG_UNICODE_PROPERTIES config ] unit-test
-
-! Tests for findall
-[
- { { f "1999-01-12" } { "year" "1999" } { "month" "01" } { "day" "12" } }
-] [
- "1999-01-12" iso-date <compiled-pcre> findall first
-] unit-test
-
-[ 3 ] [
- "2003-10-09 1999-09-01 1514-10-20" iso-date <compiled-pcre> findall length
-] unit-test
-
-[ 5 ] [ "abcdef" "[a-e]" findall length ] unit-test
-
-[ 3 ] [ "foo bar baz" "foo|bar|baz" findall length ] unit-test
-
-[ 3 ] [ "örjan är åtta" "[åäö]" findall length ] unit-test
-
-[ 3 ] [ "ÅÄÖ" "\\p{Lu}" findall length ] unit-test
-
-[ 3 ] [ "foobar" "foo(?=bar)" findall first first second length ] unit-test
-
-[ { { { f ", " } } { { f ", " } } { { f "." } } } ] [
- "Words, words, word." "\\W+" findall
-] unit-test
-
-[ { ", " ", " "." } ] [
- "Words, words, word." "\\W+" findall [ first second ] map
-] unit-test
-
-: long-string ( -- x )
- 10000 [ CHAR: a CHAR: z [a,b] random ] "" replicate-as ;
-
-! Performance
-[ 0 ] [ long-string ".{0,15}foobar.{0,10}" findall length ] unit-test
-
-! Empty matches, corner case behaviour is copied from pcredemo.c
-[ { { { f "foo" } } { { f "" } } } ]
-[ "foo" ".*" findall ] unit-test
-
-[ { { { f "" } } { { f "" } } { { f "" } } } ]
-[ "foo" "B*" findall ] unit-test
-
-! Empty matches in strings with multi-byte characters are tricky.
-[ { { { f "" } } { { f "" } } { { f "" } } { { f "" } } } ]
-[ "öööö" "x*" findall ] unit-test
-
-! Tests for matches?
-[ t ] [ "örjan" "örjan" matches? ] unit-test
-
-[ t ] [ "abcö" "\\p{Ll}{4}" matches? ] unit-test
-
-! Dotall mode, off by default
-[ f ] [ "." <compiled-pcre> PCRE_DOTALL has-option? ] unit-test
-[ t ] [ "(?s)." <compiled-pcre> PCRE_DOTALL has-option? ] unit-test
-
-[ f ] [ "\n" "." matches? ] unit-test
-[ t ] [ "\n" "(?s)." matches? ] unit-test
-
-! Caseless mode, off by default
-[ { f t } ] [
- { "x" "(?i)x" } [ <compiled-pcre> PCRE_CASELESS has-option? ] map
-] unit-test
-
-! Backreferences
-[ { t f } ] [
- { "response and responsibility" "sense and responsibility" }
- [ "(sens|respons)e and \\1ibility" matches? ] map
-] unit-test
-
-[ { t t f } ] [
- { "rah rah" "RAH RAH" "RAH rah" } [ "((?i)rah)\\s+\\1" matches? ] map
-] unit-test
-
-! Splitting
-[ { { "Words" "words" "word" } { "Words" "words" "word" } } ] [
- "Words, words, word." { "\\W+" "[,. ]" } [ split ] with map
-] unit-test
-
-! Bigger tests
-[ t ] [
- "http://factorcode.org/" http-get nip
- "href=\"(?P<link>[^\"]+)\"" findall [ "link" of ] map sequence?
-] unit-test
-
-! Test that the regexp syntax works.
-[ t ] [ "1234abcd" regexp:R[ ^\d+\w+$] matches? ] unit-test
+++ /dev/null
-USING:
- accessors
- alien.c-types alien.data alien.enums alien.strings
- arrays
- assocs
- io.encodings.utf8 io.encodings.string
- kernel
- math
- mirrors
- pcre.ffi pcre.info pcre.utils
- sequences sequences.generalizations
- strings ;
-QUALIFIED: regexp
-IN: pcre
-
-ERROR: malformed-regexp expr error ;
-ERROR: pcre-error value ;
-
-TUPLE: compiled-pcre pcre extra nametable ;
-
-: default-opts ( -- opts )
- PCRE_UTF8 PCRE_UCP bitor ;
-
-: (pcre) ( expr -- pcre err-message err-offset )
- default-opts { c-string int } [ f pcre_compile ] with-out-parameters ;
-
-: <pcre> ( expr -- pcre )
- dup (pcre) 2array swap [ 2nip ] [ malformed-regexp ] if* ;
-
-: exec ( pcre extra subject ofs opts -- count match-data )
- [ dup length ] 2dip 30 int <c-array> 30 [ pcre_exec ] 2keep drop ;
-
-: <pcre-extra> ( pcre -- pcre-extra )
- 0 { c-string } [ pcre_study ] with-out-parameters drop ;
-
-: config ( what -- alien )
- { int } [ pcre_config ] with-out-parameters ;
-
-! Finding stuff
-TUPLE: matcher pcre extra subject ofs exec-opts match ;
-
-: <matcher> ( subject compiled-pcre -- matcher )
- [ utf8 encode ] dip [ pcre>> ] [ extra>> ] bi rot 0 0 f matcher boa ;
-
-! This handling of zero-length matches is taken from pcredemo.c
-: empty-match-opts ( -- opts )
- PCRE_NOTEMPTY_ATSTART PCRE_ANCHORED bitor ;
-
-: findnext ( matcher -- matcher'/f )
- clone dup <mirror> values 6 firstn drop exec
- over dup -1 < [ PCRE_ERRORS number>enum pcre-error ] when
- -1 =
- [
- 2drop dup exec-opts>> 0 =
- [ drop f ]
- [
- dup [ subject>> ] [ ofs>> ] bi next-utf8-char
- [ >>ofs 0 >>exec-opts findnext ] [ drop f ] if*
- ] if
- ]
- [
- [ 2array >>match ]
- [
- nip
- [ first2 = [ empty-match-opts ] [ 0 ] if >>exec-opts ]
- [ second >>ofs ] bi
- ] 2bi
- ] if ;
-
-! Result parsing
-: substring-list ( subject match-array count -- alien )
- { void* } [ pcre_get_substring_list drop ] with-out-parameters ;
-
-: parse-match ( subject nametable match-data -- match )
- swapd first2 swap [ substring-list ] keep void* <c-direct-array>
- [ utf8 alien>string ] { } map-as [ of swap 2array ] with map-index ;
-
-! High-level
-: <compiled-pcre> ( expr -- compiled-pcre )
- <pcre> dup <pcre-extra> 2dup name-table-entries compiled-pcre boa ;
-
-: has-option? ( compiled-pcre option -- ? )
- [ pcre>> options ] dip bitand 0 > ;
-
-GENERIC: findall ( subject obj -- matches )
-
-M: compiled-pcre findall
- [ <matcher> [ findnext ] follow [ match>> ] map harvest ]
- [ nametable>> rot [ parse-match ] 2with map ] 2bi >array ;
-
-M: string findall
- <compiled-pcre> findall ;
-
-M: regexp:regexp findall
- raw>> findall ;
-
-: matches? ( subject obj -- ? )
- dupd findall [ nip length 1 = ] [ ?first ?first ?last = ] 2bi and ;
-
-: split ( subject obj -- strings )
- dupd findall [ first second ] map split-subseqs ;
+++ /dev/null
-USING: pcre.utils tools.test ;
-IN: pcre.utils.tests
-
-[ { "Bords" "words" "word" } ] [
- "Bords, words, word." { ", " ", " "." } split-subseqs
-] unit-test
+++ /dev/null
-USING: assocs fry kernel math mirrors sequences splitting strings ;
-IN: pcre.utils
-
-: replace-all ( seq subseqs new -- seq )
- swapd '[ _ replace ] reduce ;
-
-: split-subseqs ( seq subseqs -- seqs )
- dup first [ replace-all ] keep split-subseq [ >string ] map harvest ;
-
-: 2with ( param1 param2 obj quot -- obj curry )
- [ -rot ] dip [ [ rot ] dip call ] 3curry ; inline
-
-: gen-array-addrs ( base size n -- addrs )
- iota [ * + ] 2with map ;
-
-: utf8-start-byte? ( byte -- ? )
- 0xc0 bitand 0x80 = not ;
-
-: next-utf8-char ( byte-array pos -- pos' )
- 1 + 2dup swap ?nth
- [ utf8-start-byte? [ nip ] [ next-utf8-char ] if ] [ 2drop f ] if* ;
--- /dev/null
+USING: pcre.ffi sequences splitting tools.test ;
+IN: pcre.ffi.tests
+
+[ 2 ] [ pcre_version " " split length ] unit-test
--- /dev/null
+USING:
+ alien alien.c-types alien.data alien.libraries alien.syntax
+ classes.struct
+ combinators
+ system ;
+IN: pcre.ffi
+
+! http://sourceforge.net/projects/gnuwin32/files/pcre/7.0/pcre-7.0-bin.zip/download
+
+<< "pcre" {
+ { [ os unix? ] [ "libpcre.so" ] }
+ { [ os windows? ] [ "pcre3.dll" ] }
+} cond cdecl add-library >>
+
+LIBRARY: pcre
+
+CONSTANT: PCRE_CASELESS 0x00000001
+CONSTANT: PCRE_MULTILINE 0x00000002
+CONSTANT: PCRE_DOTALL 0x00000004
+CONSTANT: PCRE_EXTENDED 0x00000008
+CONSTANT: PCRE_ANCHORED 0x00000010
+CONSTANT: PCRE_DOLLAR_ENDONLY 0x00000020
+CONSTANT: PCRE_EXTRA 0x00000040
+CONSTANT: PCRE_NOTBOL 0x00000080
+CONSTANT: PCRE_NOTEOL 0x00000100
+CONSTANT: PCRE_UNGREEDY 0x00000200
+CONSTANT: PCRE_NOTEMPTY 0x00000400
+CONSTANT: PCRE_UTF8 0x00000800
+CONSTANT: PCRE_NO_AUTO_CAPTURE 0x00001000
+CONSTANT: PCRE_NO_UTF8_CHECK 0x00002000
+CONSTANT: PCRE_AUTO_CALLOUT 0x00004000
+CONSTANT: PCRE_PARTIAL_SOFT 0x00008000
+CONSTANT: PCRE_PARTIAL 0x00008000
+CONSTANT: PCRE_DFA_SHORTEST 0x00010000
+CONSTANT: PCRE_DFA_RESTART 0x00020000
+CONSTANT: PCRE_FIRSTLINE 0x00040000
+CONSTANT: PCRE_DUPNAMES 0x00080000
+CONSTANT: PCRE_NEWLINE_CR 0x00100000
+CONSTANT: PCRE_NEWLINE_LF 0x00200000
+CONSTANT: PCRE_NEWLINE_CRLF 0x00300000
+CONSTANT: PCRE_NEWLINE_ANY 0x00400000
+CONSTANT: PCRE_NEWLINE_ANYCRLF 0x00500000
+CONSTANT: PCRE_BSR_ANYCRLF 0x00800000
+CONSTANT: PCRE_BSR_UNICODE 0x01000000
+CONSTANT: PCRE_JAVASCRIPT_COMPAT 0x02000000
+CONSTANT: PCRE_NO_START_OPTIMIZE 0x04000000
+CONSTANT: PCRE_NO_START_OPTIMISE 0x04000000
+CONSTANT: PCRE_PARTIAL_HARD 0x08000000
+CONSTANT: PCRE_NOTEMPTY_ATSTART 0x10000000
+CONSTANT: PCRE_UCP 0x20000000
+
+ENUM: PCRE_ERRORS
+ { PCRE_ERROR_NOMATCH -1 }
+ { PCRE_ERROR_NULL -2 }
+ { PCRE_ERROR_BADOPTION -3 }
+ { PCRE_ERROR_BADMAGIC -4 }
+ { PCRE_ERROR_UNKNOWN_OPCODE -5 }
+ { PCRE_ERROR_UNKNOWN_NODE -5 }
+ { PCRE_ERROR_NOMEMORY -6 }
+ { PCRE_ERROR_NOSUBSTRING -7 }
+ { PCRE_ERROR_MATCHLIMIT -8 }
+ { PCRE_ERROR_CALLOUT -9 }
+ { PCRE_ERROR_BADUTF8 -10 }
+ { PCRE_ERROR_BADUTF8_OFFSET -11 }
+ { PCRE_ERROR_PARTIAL -12 }
+ { PCRE_ERROR_BADPARTIAL -13 }
+ { PCRE_ERROR_INTERNAL -14 }
+ { PCRE_ERROR_BADCOUNT -15 }
+ { PCRE_ERROR_DFA_UITEM -16 }
+ { PCRE_ERROR_DFA_UCOND -17 }
+ { PCRE_ERROR_DFA_UMLIMIT -18 }
+ { PCRE_ERROR_DFA_WSSIZE -19 }
+ { PCRE_ERROR_DFA_RECURSE -20 }
+ { PCRE_ERROR_RECURSIONLIMIT -21 }
+ { PCRE_ERROR_NULLWSLIMIT -22 }
+ { PCRE_ERROR_BADNEWLINE -23 }
+ { PCRE_ERROR_BADOFFSET -24 }
+ { PCRE_ERROR_SHORTUTF8 -25 } ;
+
+CONSTANT: PCRE_ERROR_NOMATCH -1
+CONSTANT: PCRE_ERROR_NULL -2
+CONSTANT: PCRE_ERROR_BADOPTION -3
+CONSTANT: PCRE_ERROR_BADMAGIC -4
+CONSTANT: PCRE_ERROR_UNKNOWN_OPCODE -5
+CONSTANT: PCRE_ERROR_UNKNOWN_NODE -5
+CONSTANT: PCRE_ERROR_NOMEMORY -6
+CONSTANT: PCRE_ERROR_NOSUBSTRING -7
+CONSTANT: PCRE_ERROR_MATCHLIMIT -8
+CONSTANT: PCRE_ERROR_CALLOUT -9
+CONSTANT: PCRE_ERROR_BADUTF8 -10
+CONSTANT: PCRE_ERROR_BADUTF8_OFFSET -11
+CONSTANT: PCRE_ERROR_PARTIAL -12
+CONSTANT: PCRE_ERROR_BADPARTIAL -13
+CONSTANT: PCRE_ERROR_INTERNAL -14
+CONSTANT: PCRE_ERROR_BADCOUNT -15
+CONSTANT: PCRE_ERROR_DFA_UITEM -16
+CONSTANT: PCRE_ERROR_DFA_UCOND -17
+CONSTANT: PCRE_ERROR_DFA_UMLIMIT -18
+CONSTANT: PCRE_ERROR_DFA_WSSIZE -19
+CONSTANT: PCRE_ERROR_DFA_RECURSE -20
+CONSTANT: PCRE_ERROR_RECURSIONLIMIT -21
+CONSTANT: PCRE_ERROR_NULLWSLIMIT -22
+CONSTANT: PCRE_ERROR_BADNEWLINE -23
+CONSTANT: PCRE_ERROR_BADOFFSET -24
+CONSTANT: PCRE_ERROR_SHORTUTF8 -25
+
+CONSTANT: PCRE_INFO_OPTIONS 0
+CONSTANT: PCRE_INFO_SIZE 1
+CONSTANT: PCRE_INFO_CAPTURECOUNT 2
+CONSTANT: PCRE_INFO_BACKREFMAX 3
+CONSTANT: PCRE_INFO_FIRSTBYTE 4
+CONSTANT: PCRE_INFO_FIRSTCHAR 4
+CONSTANT: PCRE_INFO_FIRSTTABLE 5
+CONSTANT: PCRE_INFO_LASTLITERAL 6
+CONSTANT: PCRE_INFO_NAMEENTRYSIZE 7
+CONSTANT: PCRE_INFO_NAMECOUNT 8
+CONSTANT: PCRE_INFO_NAMETABLE 9
+CONSTANT: PCRE_INFO_STUDYSIZE 10
+CONSTANT: PCRE_INFO_DEFAULT_TABLES 11
+CONSTANT: PCRE_INFO_OKPARTIAL 12
+CONSTANT: PCRE_INFO_JCHANGED 13
+CONSTANT: PCRE_INFO_HASCRORLF 14
+CONSTANT: PCRE_INFO_MINLENGTH 15
+
+CONSTANT: PCRE_CONFIG_UTF8 0
+CONSTANT: PCRE_CONFIG_NEWLINE 1
+CONSTANT: PCRE_CONFIG_LINK_SIZE 2
+CONSTANT: PCRE_CONFIG_POSIX_MALLOC_THRESHOLD 3
+CONSTANT: PCRE_CONFIG_MATCH_LIMIT 4
+CONSTANT: PCRE_CONFIG_STACKRECURSE 5
+CONSTANT: PCRE_CONFIG_UNICODE_PROPERTIES 6
+CONSTANT: PCRE_CONFIG_MATCH_LIMIT_RECURSION 7
+CONSTANT: PCRE_CONFIG_BSR 8
+
+
+STRUCT: pcre_extra
+ { flags int }
+ { study_data void* }
+ { match_limit long }
+ { callout_data void* }
+ { tables uchar* }
+ { match_limit_recursion int }
+ { mark uchar** } ;
+
+FUNCTION: void pcre_config ( int what, void* where ) ;
+
+FUNCTION: void* pcre_compile ( c-string pattern,
+ int options,
+ char** errptr,
+ int* erroffset,
+ char* tableptr ) ;
+
+FUNCTION: void* pcre_compile2 ( c-string pattern,
+ int options,
+ int* errcodeptr,
+ char** errptr,
+ int* erroffset,
+ char* tableptr ) ;
+
+FUNCTION: int pcre_info ( void* pcre, int* optptr, int* first_byte ) ;
+FUNCTION: int pcre_fullinfo ( void* pcre, pcre_extra* extra, int what, void *where ) ;
+
+FUNCTION: pcre_extra* pcre_study ( void* pcre, int options, char** errptr ) ;
+FUNCTION: int pcre_exec ( void* pcre,
+ pcre_extra* extra,
+ c-string subject,
+ int length,
+ int startoffset,
+ int options,
+ int* ovector,
+ int ovecsize ) ;
+
+FUNCTION: int pcre_get_stringnumber ( void* pcre, c-string name ) ;
+
+FUNCTION: int pcre_get_substring ( c-string subject,
+ int* ovector,
+ int stringcount,
+ int stringnumber,
+ void *stringptr ) ;
+
+FUNCTION: int pcre_get_substring_list ( c-string subject,
+ int* ovector,
+ int stringcount,
+ void *stringptr ) ;
+
+FUNCTION: c-string pcre_version ( ) ;
+
+FUNCTION: uchar* pcre_maketables ( ) ;
--- /dev/null
+USING: accessors pcre pcre.info pcre.utils sequences tools.test ;
+
+[ { { 3 "day" } { 2 "month" } { 1 "year" } } ]
+[
+ "(?P<year>\\d{4})-(?P<month>\\d{2})-(?P<day>\\d{2})" <compiled-pcre>
+ nametable>>
+] unit-test
+
+[ { 100 110 120 130 } ] [ 100 10 4 gen-array-addrs ] unit-test
--- /dev/null
+USING:
+ accessors
+ alien alien.accessors alien.c-types alien.data alien.strings
+ arrays
+ io.encodings.utf8
+ kernel
+ math
+ pcre.ffi pcre.utils
+ sequences ;
+IN: pcre.info
+
+! Mostly internal
+: fullinfo ( pcre extra what -- obj )
+ { int } [ pcre_fullinfo ] with-out-parameters nip ;
+
+: name-count ( pcre extra -- n )
+ PCRE_INFO_NAMECOUNT fullinfo ;
+
+: name-table ( pcre extra -- addr )
+ PCRE_INFO_NAMETABLE fullinfo ;
+
+: name-entry-size ( pcre extra -- size )
+ PCRE_INFO_NAMEENTRYSIZE fullinfo ;
+
+: name-table-entry ( addr -- group-index group-name )
+ [ <alien> 1 alien-unsigned-1 ] [ 2 + <alien> utf8 alien>string ] bi ;
+
+: options ( pcre -- opts )
+ f PCRE_INFO_OPTIONS fullinfo ;
+
+! Exported
+: name-table-entries ( pcre extra -- addrs )
+ [ name-table ] [ name-entry-size ] [ name-count ] 2tri gen-array-addrs
+ [ name-table-entry 2array ] map ;
--- /dev/null
+USING:
+ accessors
+ arrays
+ assocs
+ http.client
+ kernel
+ math math.ranges
+ pcre pcre.ffi pcre.info
+ random
+ sequences
+ system
+ tools.test ;
+QUALIFIED: regexp
+IN: pcre.tests
+
+CONSTANT: iso-date "(?P<year>\\d{4})-(?P<month>\\d{2})-(?P<day>\\d{2})"
+
+! On windows the erroffset appears to be set to 0 despite there being
+! nothing wrong with the regexp.
+[ t ] [
+ "foo" (pcre) 3array 1 tail { { f -1 } { f 0 } } member?
+] unit-test
+
+[ { 1 2 3 } ] [
+ iso-date <pcre>
+ { "year" "month" "day" } [ pcre_get_stringnumber ] with map
+] unit-test
+
+[ t ] [ "foo" <compiled-pcre> PCRE_UTF8 has-option? ] unit-test
+
+os unix? [ [ 10 ] [ PCRE_CONFIG_NEWLINE config ] unit-test ] when
+
+! In this day and age, not supporting utf-8 is broken.
+[ 1 ] [ PCRE_CONFIG_UTF8 config ] unit-test
+
+[ 1 ] [ PCRE_CONFIG_UNICODE_PROPERTIES config ] unit-test
+
+! Tests for findall
+[
+ { { f "1999-01-12" } { "year" "1999" } { "month" "01" } { "day" "12" } }
+] [
+ "1999-01-12" iso-date <compiled-pcre> findall first
+] unit-test
+
+[ 3 ] [
+ "2003-10-09 1999-09-01 1514-10-20" iso-date <compiled-pcre> findall length
+] unit-test
+
+[ 5 ] [ "abcdef" "[a-e]" findall length ] unit-test
+
+[ 3 ] [ "foo bar baz" "foo|bar|baz" findall length ] unit-test
+
+[ 3 ] [ "örjan är åtta" "[åäö]" findall length ] unit-test
+
+[ 3 ] [ "ÅÄÖ" "\\p{Lu}" findall length ] unit-test
+
+[ 3 ] [ "foobar" "foo(?=bar)" findall first first second length ] unit-test
+
+[ { { { f ", " } } { { f ", " } } { { f "." } } } ] [
+ "Words, words, word." "\\W+" findall
+] unit-test
+
+[ { ", " ", " "." } ] [
+ "Words, words, word." "\\W+" findall [ first second ] map
+] unit-test
+
+: long-string ( -- x )
+ 10000 [ CHAR: a CHAR: z [a,b] random ] "" replicate-as ;
+
+! Performance
+[ 0 ] [ long-string ".{0,15}foobar.{0,10}" findall length ] unit-test
+
+! Empty matches, corner case behaviour is copied from pcredemo.c
+[ { { { f "foo" } } { { f "" } } } ]
+[ "foo" ".*" findall ] unit-test
+
+[ { { { f "" } } { { f "" } } { { f "" } } } ]
+[ "foo" "B*" findall ] unit-test
+
+! Empty matches in strings with multi-byte characters are tricky.
+[ { { { f "" } } { { f "" } } { { f "" } } { { f "" } } } ]
+[ "öööö" "x*" findall ] unit-test
+
+! Tests for matches?
+[ t ] [ "örjan" "örjan" matches? ] unit-test
+
+[ t ] [ "abcö" "\\p{Ll}{4}" matches? ] unit-test
+
+! Dotall mode, off by default
+[ f ] [ "." <compiled-pcre> PCRE_DOTALL has-option? ] unit-test
+[ t ] [ "(?s)." <compiled-pcre> PCRE_DOTALL has-option? ] unit-test
+
+[ f ] [ "\n" "." matches? ] unit-test
+[ t ] [ "\n" "(?s)." matches? ] unit-test
+
+! Caseless mode, off by default
+[ { f t } ] [
+ { "x" "(?i)x" } [ <compiled-pcre> PCRE_CASELESS has-option? ] map
+] unit-test
+
+! Backreferences
+[ { t f } ] [
+ { "response and responsibility" "sense and responsibility" }
+ [ "(sens|respons)e and \\1ibility" matches? ] map
+] unit-test
+
+[ { t t f } ] [
+ { "rah rah" "RAH RAH" "RAH rah" } [ "((?i)rah)\\s+\\1" matches? ] map
+] unit-test
+
+! Splitting
+[ { { "Words" "words" "word" } { "Words" "words" "word" } } ] [
+ "Words, words, word." { "\\W+" "[,. ]" } [ split ] with map
+] unit-test
+
+! Bigger tests
+[ t ] [
+ "http://factorcode.org/" http-get nip
+ "href=\"(?P<link>[^\"]+)\"" findall [ "link" of ] map sequence?
+] unit-test
+
+! Test that the regexp syntax works.
+[ t ] [ "1234abcd" regexp:R[ ^\d+\w+$] matches? ] unit-test
--- /dev/null
+USING:
+ accessors
+ alien.c-types alien.data alien.enums alien.strings
+ arrays
+ assocs
+ io.encodings.utf8 io.encodings.string
+ kernel
+ math
+ mirrors
+ pcre.ffi pcre.info pcre.utils
+ sequences sequences.generalizations
+ strings ;
+QUALIFIED: regexp
+IN: pcre
+
+ERROR: malformed-regexp expr error ;
+ERROR: pcre-error value ;
+
+TUPLE: compiled-pcre pcre extra nametable ;
+
+: default-opts ( -- opts )
+ PCRE_UTF8 PCRE_UCP bitor ;
+
+: (pcre) ( expr -- pcre err-message err-offset )
+ default-opts { c-string int } [ f pcre_compile ] with-out-parameters ;
+
+: <pcre> ( expr -- pcre )
+ dup (pcre) 2array swap [ 2nip ] [ malformed-regexp ] if* ;
+
+: exec ( pcre extra subject ofs opts -- count match-data )
+ [ dup length ] 2dip 30 int <c-array> 30 [ pcre_exec ] 2keep drop ;
+
+: <pcre-extra> ( pcre -- pcre-extra )
+ 0 { c-string } [ pcre_study ] with-out-parameters drop ;
+
+: config ( what -- alien )
+ { int } [ pcre_config ] with-out-parameters ;
+
+! Finding stuff
+TUPLE: matcher pcre extra subject ofs exec-opts match ;
+
+: <matcher> ( subject compiled-pcre -- matcher )
+ [ utf8 encode ] dip [ pcre>> ] [ extra>> ] bi rot 0 0 f matcher boa ;
+
+! This handling of zero-length matches is taken from pcredemo.c
+: empty-match-opts ( -- opts )
+ PCRE_NOTEMPTY_ATSTART PCRE_ANCHORED bitor ;
+
+: findnext ( matcher -- matcher'/f )
+ clone dup <mirror> values 6 firstn drop exec
+ over dup -1 < [ PCRE_ERRORS number>enum pcre-error ] when
+ -1 =
+ [
+ 2drop dup exec-opts>> 0 =
+ [ drop f ]
+ [
+ dup [ subject>> ] [ ofs>> ] bi next-utf8-char
+ [ >>ofs 0 >>exec-opts findnext ] [ drop f ] if*
+ ] if
+ ]
+ [
+ [ 2array >>match ]
+ [
+ nip
+ [ first2 = [ empty-match-opts ] [ 0 ] if >>exec-opts ]
+ [ second >>ofs ] bi
+ ] 2bi
+ ] if ;
+
+! Result parsing
+: substring-list ( subject match-array count -- alien )
+ { void* } [ pcre_get_substring_list drop ] with-out-parameters ;
+
+: parse-match ( subject nametable match-data -- match )
+ swapd first2 swap [ substring-list ] keep void* <c-direct-array>
+ [ utf8 alien>string ] { } map-as [ of swap 2array ] with map-index ;
+
+! High-level
+: <compiled-pcre> ( expr -- compiled-pcre )
+ <pcre> dup <pcre-extra> 2dup name-table-entries compiled-pcre boa ;
+
+: has-option? ( compiled-pcre option -- ? )
+ [ pcre>> options ] dip bitand 0 > ;
+
+GENERIC: findall ( subject obj -- matches )
+
+M: compiled-pcre findall
+ [ <matcher> [ findnext ] follow [ match>> ] map harvest ]
+ [ nametable>> rot [ parse-match ] 2with map ] 2bi >array ;
+
+M: string findall
+ <compiled-pcre> findall ;
+
+M: regexp:regexp findall
+ raw>> findall ;
+
+: matches? ( subject obj -- ? )
+ dupd findall [ nip length 1 = ] [ ?first ?first ?last = ] 2bi and ;
+
+: split ( subject obj -- strings )
+ dupd findall [ first second ] map split-subseqs ;
--- /dev/null
+USING: pcre.utils tools.test ;
+IN: pcre.utils.tests
+
+[ { "Bords" "words" "word" } ] [
+ "Bords, words, word." { ", " ", " "." } split-subseqs
+] unit-test
--- /dev/null
+USING: assocs fry kernel math mirrors sequences splitting strings ;
+IN: pcre.utils
+
+: replace-all ( seq subseqs new -- seq )
+ swapd '[ _ replace ] reduce ;
+
+: split-subseqs ( seq subseqs -- seqs )
+ dup first [ replace-all ] keep split-subseq [ >string ] map harvest ;
+
+: 2with ( param1 param2 obj quot -- obj curry )
+ [ -rot ] dip [ [ rot ] dip call ] 3curry ; inline
+
+: gen-array-addrs ( base size n -- addrs )
+ iota [ * + ] 2with map ;
+
+: utf8-start-byte? ( byte -- ? )
+ 0xc0 bitand 0x80 = not ;
+
+: next-utf8-char ( byte-array pos -- pos' )
+ 1 + 2dup swap ?nth
+ [ utf8-start-byte? [ nip ] [ next-utf8-char ] if ] [ 2drop f ] if* ;