From: John Benediktsson Date: Thu, 21 Mar 2019 20:29:15 +0000 (-0700) Subject: regexp: try again to fix the issue with backslashes. X-Git-Tag: 0.99~3894 X-Git-Url: https://gitweb.factorcode.org/gitweb.cgi?p=factor.git;a=commitdiff_plain;h=26abdf4a23def8a62513d60d96dade8554ef9c0e regexp: try again to fix the issue with backslashes. The new simple rule is: inside R/ syntax, all slashes should be backslash-escaped to allow easy regexp literal tokenization: R/ [\/]/ but in the constructor they shouldn't be: "[/]". When the regexp is prettyprinted, we make sure to visually escape the slashes in the raw regexp. --- diff --git a/basis/globs/globs.factor b/basis/globs/globs.factor index 3a540e2b23..e2bbb99aee 100644 --- a/basis/globs/globs.factor +++ b/basis/globs/globs.factor @@ -7,10 +7,10 @@ unicode multiline ; IN: globs : not-path-separator ( -- sep ) - os windows? R/ [^\\/\\]/ R/ [^\\/]/ ? ; foldable + os windows? R/ [^\/\\]/ R/ [^\/]/ ? ; foldable : wild-path-separator ( -- sep ) - os windows? R/ [^\\/\\][\\/\\]|[^\\/\\]/ R/ [^\\/][\\/]|[^\\/]/ ? ; foldable + os windows? R/ [^\/\\][\/\\]|[^\/\\]/ R/ [^\/][\/]|[^\/]/ ? ; foldable EBNF: [=[ diff --git a/basis/regexp/parser/parser.factor b/basis/regexp/parser/parser.factor index 931843542c..f28aa61c39 100644 --- a/basis/regexp/parser/parser.factor +++ b/basis/regexp/parser/parser.factor @@ -3,7 +3,8 @@ USING: accessors arrays assocs combinators combinators.short-circuit interval-maps kernel locals math.parser memoize multiline peg.ebnf regexp.ast regexp.classes -sequences sets splitting strings unicode unicode.data unicode.script ; +sequences sets splitting strings unicode unicode.data +unicode.script ; IN: regexp.parser : allowed-char? ( ch -- ? 
) @@ -70,13 +71,14 @@ MEMO: simple-category-table ( -- table ) : lookup-escape ( char -- ast ) { - { CHAR: t [ CHAR: \t ] } + { CHAR: a [ CHAR: \a ] } + { CHAR: e [ CHAR: \e ] } + { CHAR: f [ CHAR: \f ] } { CHAR: n [ CHAR: \n ] } { CHAR: r [ CHAR: \r ] } - { CHAR: f [ 0xc ] } - { CHAR: a [ 0x7 ] } - { CHAR: e [ 0x1b ] } - { CHAR: \\ [ CHAR: \\ ] } + { CHAR: t [ CHAR: \t ] } + { CHAR: v [ CHAR: \v ] } + { CHAR: 0 [ CHAR: \0 ] } { CHAR: w [ c-identifier-class ] } { CHAR: W [ c-identifier-class ] } diff --git a/basis/regexp/prettyprint/prettyprint.factor b/basis/regexp/prettyprint/prettyprint.factor index 372ef69062..ae3877c7d7 100644 --- a/basis/regexp/prettyprint/prettyprint.factor +++ b/basis/regexp/prettyprint/prettyprint.factor @@ -7,7 +7,7 @@ IN: regexp.prettyprint M: regexp pprint* [ [ - [ raw>> "\\/" "\\\\/" replace "R/ " % % "/" % ] + [ raw>> "/" "\\/" replace "R/ " % % "/" % ] [ options>> options>string % ] bi ] "" make ] keep present-text ; diff --git a/basis/regexp/regexp-tests.factor b/basis/regexp/regexp-tests.factor index 30ec30a6d9..36e877a03f 100644 --- a/basis/regexp/regexp-tests.factor +++ b/basis/regexp/regexp-tests.factor @@ -49,6 +49,9 @@ IN: regexp.tests { t } [ "a" ".+" matches? ] unit-test { t } [ "ab" ".+" matches? ] unit-test +{ t } [ "\0" "[\\0]" matches? ] unit-test +{ f } [ "0" "[\\0]" matches? ] unit-test + { t } [ " " "[\\s]" matches? ] unit-test { f } [ "a" "[\\s]" matches? ] unit-test { f } [ " " "[\\S]" matches? ] unit-test @@ -335,6 +338,10 @@ unit-test { "XhXXlXlXoX XwXoXrXlXdX" } [ "hello world" R/ e*/ "X" re-replace ] unit-test { "-- title --" } [ "== title ==" R/ =/ "-" re-replace ] unit-test +{ "abc" } [ "a/ \\bc" "/.*\\" "" re-replace ] unit-test +{ "ac" } [ "a/ \\bc" R/ \/.*\\./ "" re-replace ] unit-test +{ "abc" } [ "a/ \\bc" R/ \/.*\\/ "" re-replace ] unit-test + { "" } [ "ab" "a(?!b)" first-match >string ] unit-test { "a" } [ "ac" "a(?!b)" first-match >string ] unit-test { t } [ "fxxbar" ".{3}(?!foo)bar" matches? 
] unit-test diff --git a/basis/regexp/regexp.factor b/basis/regexp/regexp.factor index 19d2d8710b..c31571c718 100644 --- a/basis/regexp/regexp.factor +++ b/basis/regexp/regexp.factor @@ -200,10 +200,11 @@ PRIVATE> : take-until ( lexer -- string ) dup skip-blank [ dupd [ - [ CHAR: / -rot index-from ] keep - over [ "Unterminated regexp" throw ] unless - 2dup [ 1 - ] dip nth CHAR: \\ = - [ [ [ 1 + ] dip ] when ] keep + [ [ "\\/" member? ] find-from ] keep swap [ + CHAR: \ = [ [ 2 + ] dip t ] [ f ] if + ] [ + "Unterminated regexp" throw + ] if* ] loop over [ subseq ] dip 1 + ] change-lexer-column ; diff --git a/basis/validators/validators.factor b/basis/validators/validators.factor index c3b800ae57..cb3d2eca31 100644 --- a/basis/validators/validators.factor +++ b/basis/validators/validators.factor @@ -65,7 +65,7 @@ IN: validators v-regexp ; : v-url ( str -- str ) - "URL" R/ (?:ftp|http|https):\\/\\/\S+/ v-regexp ; + "URL" R/ (?:ftp|http|https):\/\/\S+/ v-regexp ; : v-captcha ( str -- str ) dup empty? 
[ "must remain blank" throw ] unless ; diff --git a/extra/metar/metar.factor b/extra/metar/metar.factor index e4a98b8e99..f01f538744 100644 --- a/extra/metar/metar.factor +++ b/extra/metar/metar.factor @@ -283,11 +283,11 @@ CONSTANT: sky H{ CONSTANT: re-timestamp R/ \d{6}Z/ CONSTANT: re-station R/ \w{4}/ -CONSTANT: re-temperature R/ [M]?\d{2}\\/([M]?\d{2})?/ +CONSTANT: re-temperature R/ [M]?\d{2}\/([M]?\d{2})?/ CONSTANT: re-wind R/ (VRB|\d{3})\d{2,3}(G\d{2,3})?KT/ CONSTANT: re-wind-variable R/ \d{3}V\d{3}/ -CONSTANT: re-visibility R/ [MP]?\d+(\\/\d+)?SM/ -CONSTANT: re-rvr R/ R\d{2}[RLC]?\\/\d{4}(V\d{4})?FT/ +CONSTANT: re-visibility R/ [MP]?\d+(\/\d+)?SM/ +CONSTANT: re-rvr R/ R\d{2}[RLC]?\/\d{4}(V\d{4})?FT/ CONSTANT: re-weather R/ [+-]?(VC)?(\w{2}|\w{4})/ CONSTANT: re-sky-condition R/ (\w{2,3}\d{3}(\w+)?|\w{3}|CAVOK)/ CONSTANT: re-altimeter R/ [AQ]\d{4}/ @@ -519,23 +519,23 @@ CONSTANT: re-recent-weather R/ ((\w{2})?[BE]\d{2,4}((\w{2})?[BE]\d{2,4})?)+/ { [ dup R/ 1\d{4}/ matches? ] [ parse-6hr-max-temp ] } { [ dup R/ 2\d{4}/ matches? ] [ parse-6hr-min-temp ] } { [ dup R/ 4\d{8}/ matches? ] [ parse-24hr-temp ] } - { [ dup R/ 4\\/\d{3}/ matches? ] [ parse-snow-depth ] } + { [ dup R/ 4\/\d{3}/ matches? ] [ parse-snow-depth ] } { [ dup R/ 5\d{4}/ matches? ] [ parse-1hr-pressure ] } - { [ dup R/ 6[\d\\/]{4}/ matches? ] [ parse-6hr-precipitation ] } + { [ dup R/ 6[\d\/]{4}/ matches? ] [ parse-6hr-precipitation ] } { [ dup R/ 7\d{4}/ matches? ] [ parse-24hr-precipitation ] } - { [ dup R/ 8\\/\d{3}/ matches? ] [ parse-cloud-cover ] } + { [ dup R/ 8\/\d{3}/ matches? ] [ parse-cloud-cover ] } { [ dup R/ 931\d{3}/ matches? ] [ parse-6hr-snowfall ] } { [ dup R/ 933\d{3}/ matches? ] [ parse-water-equivalent-snow ] } { [ dup R/ 98\d{3}/ matches? ] [ parse-duration-of-sunshine ] } { [ dup R/ T\d{4,8}/ matches? ] [ parse-1hr-temp ] } - { [ dup R/ \d{3}\d{2,3}\\/\d{2,4}/ matches? ] [ parse-peak-wind ] } + { [ dup R/ \d{3}\d{2,3}\/\d{2,4}/ matches? 
] [ parse-peak-wind ] } { [ dup R/ P\d{4}/ matches? ] [ parse-1hr-precipitation ] } { [ dup R/ SLP\d{3}/ matches? ] [ parse-sea-level-pressure ] } { [ dup R/ LTG\w+/ matches? ] [ parse-lightning ] } { [ dup R/ PROB\d+/ matches? ] [ parse-probability ] } { [ dup R/ \d{3}V\d{3}/ matches? ] [ parse-varying ] } { [ dup R/ [^-]+(-[^-]+)+/ matches? ] [ parse-from-to ] } - { [ dup R/ [^\\/]+(\\/[^\\/]+)+/ matches? ] [ ] } + { [ dup R/ [^\/]+(\/[^\/]+)+/ matches? ] [ ] } { [ dup R/ \d+.\d+/ matches? ] [ ] } { [ dup re-recent-weather matches? ] [ parse-recent-weather ] } { [ dup re-weather matches? ] [ parse-weather ] } diff --git a/extra/xkcd/xkcd.factor b/extra/xkcd/xkcd.factor index f9a5f06b69..8b9546f64a 100644 --- a/extra/xkcd/xkcd.factor +++ b/extra/xkcd/xkcd.factor @@ -13,7 +13,7 @@ IN: xkcd : comic-image ( url -- image ) http-get nip - R/ \/\/imgs\.xkcd\.com\\/comics\\/[^\.]+\.(png|jpg)/ + R/ \/\/imgs\.xkcd\.com\/comics\/[^\.]+\.(png|jpg)/ first-match >string "http:" prepend load-http-image ; : comic-image. ( url -- )