basis/modern/modern.factor

   1 ! Copyright (C) 2016 Doug Coleman.
   2 ! See http://factorcode.org/license.txt for BSD license.
   3 USING: arrays assocs combinators combinators.short-circuit
   4 continuations fry io.encodings.utf8 io.files kernel locals make
   5 math math.order modern.paths modern.slices sequences
   6 sequences.extras sets splitting strings unicode vocabs.loader ;
   7 IN: modern
   8
   9 ERROR: string-expected-got-eof n string ;
  10 ERROR: long-opening-mismatch tag open n string ch ;
  11
  12 ! (( )) [[ ]] {{ }}
  13 MACRO:: read-double-matched ( open-ch -- quot: ( n string tag ch -- n' string seq ) )
  14     open-ch dup matching-delimiter {
  15         [ drop 2 swap <string> ]
  16         [ drop 1string ]
  17         [ nip 2 swap <string> ]
  18     } 2cleave :> ( openstr2 openstr1 closestr2 )
  19     [| n string tag! ch |
  20         ch {
  21             { CHAR: = [
  22                 tag 1 cut-slice* drop tag! ! tag of (=( is ( here, fix it
  23                 n string openstr1 slice-til-separator-inclusive [ -1 modify-from ] dip :> ( n' string' opening ch )
  24                 ch open-ch = [ tag openstr2 n string ch long-opening-mismatch ] unless
  25                 opening matching-delimiter-string :> needle
  26
  27                 n' string' needle slice-til-string :> ( n'' string'' payload closing )
  28                 n'' string
  29                 tag opening payload closing 4array
  30             ] }
  31             { open-ch [
  32                 tag 1 cut-slice* swap tag! 1 modify-to :> opening
  33                 n 1 + string closestr2 slice-til-string :> ( n' string' payload closing )
  34                 n' string
  35                 tag opening payload closing 4array
  36             ] }
  37             [ [ tag openstr2 n string ] dip long-opening-mismatch ]
  38         } case
  39      ] ;
  40
  41 : read-double-matched-paren ( n string tag ch -- n' string seq ) CHAR: ( read-double-matched ;
  42 : read-double-matched-bracket ( n string tag ch -- n' string seq ) CHAR: [ read-double-matched ;
  43 : read-double-matched-brace ( n string tag ch -- n' string seq ) CHAR: { read-double-matched ;
  44
  45 DEFER: lex-factor-top
  46 DEFER: lex-factor
  47 ERROR: lex-expected-but-got-eof n string expected ;
  48 ! For implementing [ { (
  49 : lex-until ( n string tag-sequence -- n' string payload )
  50     3dup '[
  51         [
  52             lex-factor-top dup f like [ , ] when* [
  53                 dup [
  54                     ! } gets a chance, but then also full seq { } after recursion...
  55                     [ _ ] dip '[ _ sequence= ] any? not
  56                 ] [
  57                     drop t ! loop again?
  58                 ] if
  59             ] [
  60                 _ _ _ lex-expected-but-got-eof
  61             ] if*
  62         ] loop
  63     ] { } make ;
  64
  65 DEFER: section-close?
  66 DEFER: upper-colon?
  67 DEFER: lex-factor-nested
  68 : lex-colon-until ( n string tag-sequence -- n' string payload )
  69     '[
  70         [
  71             lex-factor-nested dup f like [ , ] when* [
  72                 dup [
  73                     ! This is for ending COLON: forms like ``A: PRIVATE>``
  74                     dup section-close? [
  75                         drop f
  76                     ] [
  77                         ! } gets a chance, but then also full seq { } after recursion...
  78                         [ _ ] dip '[ _ sequence= ] any? not
  79                     ] if
  80                 ] [
  81                     drop t ! loop again?
  82                 ] if
  83             ] [
  84                 f
  85             ] if*
  86         ] loop
  87     ] { } make ;
  88
  89 : split-double-dash ( seq -- seqs )
  90     dup [ { [ "--" sequence= ] } 1&& ] split-when
  91     dup length 1 > [ nip ] [ drop ] if ;
  92
  93 MACRO:: read-matched ( ch -- quot: ( n string tag -- n' string slice' ) )
  94     ch dup matching-delimiter {
  95         [ drop "=" swap prefix ]
  96         [ nip 1string ]
  97     } 2cleave :> ( openstreq closestr1 )  ! [= ]
  98     [| n string tag |
  99         n string tag
 100         2over nth-check-eof {
 101             { [ dup openstreq member? ] [ ch read-double-matched ] } ! (=( or ((
 102             { [ dup blank? ] [
 103                 drop dup '[ _ matching-delimiter-string closestr1 2array members lex-until ] dip
 104                 swap unclip-last 3array ] } ! ( foo )
 105             [ drop [ slice-til-whitespace drop ] dip span-slices ]  ! (foo)
 106         } cond
 107     ] ;
 108
 109 : read-bracket ( n string slice -- n' string slice' ) CHAR: [ read-matched ;
 110 : read-brace ( n string slice -- n' string slice' ) CHAR: { read-matched ;
 111 : read-paren ( n string slice -- n' string slice' ) CHAR: ( read-matched ;
 112 : read-string-payload ( n string -- n' string )
 113     over [
 114         { CHAR: \\ CHAR: \" } slice-til-separator-inclusive {
 115             { f [ drop ] }
 116             { CHAR: \" [ drop ] }
 117             { CHAR: \\ [ drop next-char-from drop read-string-payload ] }
 118         } case
 119     ] [
 120         string-expected-got-eof
 121     ] if ;
 122
 123 :: read-string ( n string tag -- n' string seq )
 124     n string read-string-payload drop :> n'
 125     n' string
 126     n' [ n string string-expected-got-eof ] unless
 127     n n' 1 - string <slice>
 128     n' 1 - n' string <slice>
 129     tag -rot 3array ;
 130
 131 : take-comment ( n string slice -- n' string comment )
 132     2over ?nth CHAR: [ = [
 133         [ 1 + ] 2dip 2over ?nth read-double-matched-bracket
 134     ] [
 135         [ slice-til-eol drop ] dip swap 2array
 136     ] if ;
 137
 138 : terminator? ( slice -- ? )
 139     {
 140         [ ";" sequence= ]
 141         [ "]" sequence= ]
 142         [ "}" sequence= ]
 143         [ ")" sequence= ]
 144     } 1|| ;
 145
 146 ERROR: expected-length-tokens n string length seq ;
 147 : ensure-no-false ( n string seq -- n string seq )
 148     dup [ length 0 > ] all? [ [ length ] keep expected-length-tokens ] unless ;
 149
 150 ERROR: token-expected n string obj ;
 151 ERROR: unexpected-terminator n string slice ;
 152 : read-lowercase-colon ( n string slice -- n' string lowercase-colon )
 153     dup [ CHAR: : = ] count-tail
 154     '[
 155         _ [ lex-factor ] replicate ensure-no-false dup [ token-expected ] unless
 156         dup terminator? [ unexpected-terminator ] when
 157     ] dip swap 2array ;
 158
 159 : (strict-upper?) ( string -- ? )
 160     {
 161         ! All chars must...
 162         [
 163             [
 164                 { [ CHAR: A CHAR: Z between? ] [ "':-\\#" member? ] } 1||
 165             ] all?
 166         ]
 167         ! At least one char must...
 168         [ [ { [ CHAR: A CHAR: Z between? ] [ CHAR: ' = ] } 1|| ] any? ]
 169     } 1&& ;
 170
 171 : strict-upper? ( string -- ? )
 172     { [ ":" sequence= ] [ (strict-upper?) ] } 1|| ;
 173
 174 ! <A <A: but not <A>
 175 : section-open? ( string -- ? )
 176     {
 177         [ "<" head? ]
 178         [ length 2 >= ]
 179         [ rest strict-upper? ]
 180         [ ">" tail? not ]
 181     } 1&& ;
 182
 183 : html-self-close? ( string -- ? )
 184     {
 185         [ "<" head? ]
 186         [ length 2 >= ]
 187         [ rest strict-upper? not ]
 188         [ [ blank? ] any? not ]
 189         [ "/>" tail? ]
 190     } 1&& ;
 191
 192 : html-full-open? ( string -- ? )
 193     {
 194         [ "<" head? ]
 195         [ length 2 >= ]
 196         [ second CHAR: / = not ]
 197         [ rest strict-upper? not ]
 198         [ [ blank? ] any? not ]
 199         [ ">" tail? ]
 200     } 1&& ;
 201
 202 : html-half-open? ( string -- ? )
 203     {
 204         [ "<" head? ]
 205         [ length 2 >= ]
 206         [ second CHAR: / = not ]
 207         [ rest strict-upper? not ]
 208         [ [ blank? ] any? not ]
 209         [ ">" tail? not ]
 210     } 1&& ;
 211
 212 : html-close? ( string -- ? )
 213     {
 214         [ "</" head? ]
 215         [ length 2 >= ]
 216         [ rest strict-upper? not ]
 217         [ [ blank? ] any? not ]
 218         [ ">" tail? ]
 219     } 1&& ;
 220
 221 : special-acute? ( string -- ? )
 222     {
 223         [ section-open? ]
 224         [ html-self-close? ]
 225         [ html-full-open? ]
 226         [ html-half-open? ]
 227         [ html-close? ]
 228     } 1|| ;
 229
 230 : upper-colon? ( string -- ? )
 231     dup { [ length 0 > ] [ [ CHAR: : = ] all? ] } 1&& [
 232         drop t
 233     ] [
 234         {
 235             [ length 2 >= ]
 236             [ "\\" head? not ] ! XXX: good?
 237             [ ":" tail? ]
 238             [ dup [ CHAR: : = ] find drop head strict-upper? ]
 239         } 1&&
 240     ] if ;
 241
 242 : section-close? ( string -- ? )
 243     {
 244         [ length 2 >= ]
 245         [ "\\" head? not ] ! XXX: good?
 246         [ ">" tail? ]
 247         [
 248             {
 249                 [ but-last strict-upper? ]
 250                 [ { [ ";" head? ] [ rest but-last strict-upper? ] } 1&& ]
 251             } 1||
 252         ]
 253     } 1&& ;
 254
 255 : read-til-semicolon ( n string slice -- n' string semi )
 256     dup '[ but-last ";" append ";" 2array { "--" ")" } append lex-colon-until ] dip
 257     swap
 258     ! What ended the FOO: .. ; form?
 259     ! Remove the ; from the payload if present
 260     ! XXX: probably can remove this, T: is dumb
 261     ! Also in stack effects ( T: int -- ) can be ended by -- and )
 262     dup ?last {
 263         { [ dup ";" sequence= ] [ drop unclip-last 3array ] }
 264         { [ dup ";" tail? ] [ drop unclip-last 3array ] }
 265         { [ dup "--" sequence= ] [ drop unclip-last -rot 2array [ rewind-slice ] dip ] }
 266         { [ dup "]" sequence= ] [ drop unclip-last -rot 2array [ rewind-slice ] dip ] }
 267         { [ dup "}" sequence= ] [ drop unclip-last -rot 2array [ rewind-slice ] dip ] }
 268         { [ dup ")" sequence= ] [ drop unclip-last -rot 2array [ rewind-slice ] dip ] } ! (n*quot) breaks
 269         { [ dup section-close? ] [ drop unclip-last -rot 2array [ rewind-slice ] dip ] }
 270         { [ dup upper-colon? ] [ drop unclip-last -rot 2array [ rewind-slice ] dip ] }
 271         [ drop 2array ]
 272     } cond ;
 273
 274 ERROR: colon-word-must-be-all-uppercase-or-lowercase n string word ;
 275 : read-colon ( n string slice -- n' string colon )
 276     {
 277         { [ dup strict-upper? ] [ read-til-semicolon ] }
 278         { [ dup ":" tail? ] [ dup ":" head? [ read-lowercase-colon ] unless ] } ! :foo: vs foo:
 279         [ ]
 280     } cond ;
 281
 282 : read-acute-html ( n string slice -- n' string acute )
 283     {
 284         ! <FOO <FOO:
 285         { [ dup section-open? ] [
 286             [
 287                 matching-section-delimiter 1array lex-until
 288             ] keep swap unclip-last 3array
 289         ] }
 290         ! <foo/>
 291         { [ dup html-self-close? ] [
 292             ! do nothing special
 293         ] }
 294         ! <foo>
 295         { [ dup html-full-open? ] [
 296             dup [
 297                 rest-slice
 298                 dup ">" tail? [ but-last-slice ] when
 299                 "</" ">" surround 1array lex-until unclip-last
 300             ] dip -rot 3array
 301         ] }
 302         ! <foo
 303         { [ dup html-half-open? ] [
 304             ! n seq slice
 305             [ { ">" "/>" } lex-until ] dip
 306             ! n seq slice2 slice
 307             over ">" sequence= [
 308                 "</" ">" surround array '[ _ lex-until ] dip unclip-last
 309                 -rot roll unclip-last [ 3array ] 2dip 3array
 310             ] [
 311                 ! self-contained
 312                 swap unclip-last 3array
 313             ] if
 314         ] }
 315         ! </foo>
 316         { [ dup html-close? ] [
 317             ! Do nothing
 318         ] }
 319         [ [ slice-til-whitespace drop ] dip span-slices ]
 320     } cond ;
 321
 322 : read-acute ( n string slice -- n' string acute )
 323     [ matching-section-delimiter 1array lex-until ] keep swap unclip-last 3array ;
 324
 325 ! Words like append! and suffix! are allowed for now.
 326 : read-exclamation ( n string slice -- n' string obj )
 327     dup { [ "!" sequence= ] [ "#!" sequence= ] } 1||
 328     [ take-comment ] [ merge-slice-til-whitespace ] if ;
 329
 330 ERROR: no-backslash-payload n string slice ;
 331 : (read-backslash) ( n string slice -- n' string obj )
 332     merge-slice-til-whitespace dup "\\" tail? [
 333         ! \ foo, M\ foo
 334         dup [ CHAR: \\ = ] count-tail
 335         '[
 336             _ [ skip-blank-from slice-til-whitespace drop ] replicate
 337             ensure-no-false
 338             dup [ no-backslash-payload ] unless
 339         ] dip swap 2array
 340     ] when ;
 341
 342 DEFER: lex-factor-top*
 343 : read-backslash ( n string slice -- n' string obj )
 344     ! foo\ so far, could be foo\bar{
 345     ! remove the \ and continue til delimiter/eof
 346     [ "\"!:[{(<>\s\r\n" slice-til-either ] dip swap [ span-slices ] dip
 347     over "\\" head? [
 348         drop
 349         ! \ foo
 350         dup [ CHAR: \\ = ] all? [ (read-backslash) ] [ merge-slice-til-whitespace ] if
 351     ] [
 352         ! foo\ or foo\bar (?)
 353         over "\\" tail? [ drop (read-backslash) ] [ lex-factor-top* ] if
 354     ] if ;
 355
 356 ! If the slice is 0 width, we stopped on whitespace.
 357 ! Advance the index and read again!
 358
 359 : read-token-or-whitespace-top ( n string slice -- n' string slice/f )
 360     dup length 0 = [ [ 1 + ] 2dip drop lex-factor-top ] when ;
 361
 362 : read-token-or-whitespace-nested ( n string slice -- n' string slice/f )
 363     dup length 0 = [ [ 1 + ] 2dip drop lex-factor-nested ] when ;
 364
 365 : lex-factor-fallthrough ( n/f string slice/f ch/f -- n'/f string literal )
 366     {
 367         { CHAR: \ [ read-backslash ] }
 368         { CHAR: [ [ read-bracket ] }
 369         { CHAR: { [ read-brace ] }
 370         { CHAR: ( [ read-paren ] }
 371         { CHAR: ] [ ] }
 372         { CHAR: } [ ] }
 373         { CHAR: ) [ ] }
 374         { CHAR: " [ read-string ] }
 375         { CHAR: ! [ read-exclamation ] }
 376         { CHAR: > [
 377             [ [ CHAR: > = not ] slice-until ] dip merge-slices
 378             dup section-close? [
 379                 [ slice-til-whitespace drop ] dip ?span-slices
 380             ] unless
 381         ] }
 382         { f [ ] }
 383     } case ;
 384
 385 ! Inside a FOO: or a <FOO FOO>
 386 : lex-factor-nested* ( n/f string slice/f ch/f -- n'/f string literal )
 387     {
 388         ! Nested ``A: a B: b`` so rewind and let the parser get it top-level
 389         { CHAR: : [
 390             ! A: B: then interrupt the current parser
 391             ! A: b: then keep going
 392             merge-slice-til-whitespace
 393             dup { [ upper-colon? ] [ ":" = ] } 1||
 394             ! dup upper-colon?
 395             [ rewind-slice f ]
 396             [ read-colon ] if
 397         ] }
 398         { CHAR: < [
 399             ! FOO: a b <BAR: ;BAR>
 400             ! FOO: a b <BAR BAR>
 401             ! FOO: a b <asdf>
 402             ! FOO: a b <asdf asdf>
 403
 404             ! if we are in a FOO: and we hit a <BAR or <BAR:
 405             ! then end the FOO:
 406             ! Don't rewind for a <foo/> or <foo></foo>
 407             [ slice-til-whitespace drop ] dip span-slices
 408             dup section-open? [ rewind-slice f ] when
 409         ] }
 410         { CHAR: \s [ read-token-or-whitespace-nested ] }
 411         { CHAR: \r [ read-token-or-whitespace-nested ] }
 412         { CHAR: \n [ read-token-or-whitespace-nested ] }
 413         [ lex-factor-fallthrough ]
 414     } case ;
 415
 416 : lex-factor-nested ( n/f string -- n'/f string literal )
 417     ! skip-whitespace
 418     "\"\\!:[{(]})<>\s\r\n" slice-til-either
 419     lex-factor-nested* ; inline
 420
 421 : lex-factor-top* ( n/f string slice/f ch/f -- n'/f string literal )
 422     {
 423         { CHAR: : [ merge-slice-til-whitespace read-colon ] }
 424         { CHAR: < [
 425             ! FOO: a b <BAR: ;BAR>
 426             ! FOO: a b <BAR BAR>
 427             ! FOO: a b <asdf>
 428             ! FOO: a b <asdf asdf>
 429
 430             ! if we are in a FOO: and we hit a <BAR or <BAR:
 431             ! then end the FOO:
 432             [ slice-til-whitespace drop ] dip span-slices
 433             ! read-acute-html
 434             dup section-open? [ read-acute ] when
 435         ] }
 436
 437         { CHAR: \s [ read-token-or-whitespace-top ] }
 438         { CHAR: \r [ read-token-or-whitespace-top ] }
 439         { CHAR: \n [ read-token-or-whitespace-top ] }
 440         [ lex-factor-fallthrough ]
 441     } case ;
 442
 443 : lex-factor-top ( n/f string -- n'/f string literal )
 444     ! skip-whitespace
 445     "\"\\!:[{(]})<>\s\r\n" slice-til-either
 446     lex-factor-top* ; inline
 447
 448 ERROR: compound-syntax-disallowed n seq obj ;
 449 : check-for-compound-syntax ( n/f seq obj -- n/f seq obj )
 450     dup length 1 > [ compound-syntax-disallowed ] when ;
 451
 452 : check-compound-loop ( n/f string -- n/f string ? )
 453     [ ] [ peek-from ] [ previous-from ] 2tri
 454     [ blank? ] bi@ or not ! no blanks between tokens
 455     pick and ; ! and a valid index
 456
 457 : lex-factor ( n/f string/f -- n'/f string literal/f )
 458     [
 459         ! Compound syntax loop
 460         [
 461             lex-factor-top f like [ , ] when*
 462             ! concatenated syntax ( a )[ a 1 + ]( b )
 463             check-compound-loop
 464         ] loop
 465     ] { } make
 466     check-for-compound-syntax
 467     ! concat ! "ALIAS: n*quot (n*quot)" string>literals ... breaks here
 468     ?first f like ;
 469
 470 : string>literals ( string -- sequence )
 471     [ 0 ] dip [
 472         [ lex-factor [ , ] when* over ] loop
 473     ] { } make 2nip ;
 474
 475 : vocab>literals ( vocab -- sequence )
 476     ".private" ?tail drop
 477     vocab-source-path utf8 file-contents string>literals ;
 478
 479 : path>literals ( path -- sequence )
 480     utf8 file-contents string>literals ;
 481
 482 : lex-paths ( vocabs -- assoc )
 483     [ [ path>literals ] [ nip ] recover ] map-zip ;
 484
 485 : lex-vocabs ( vocabs -- assoc )
 486     [ [ vocab>literals ] [ nip ] recover ] map-zip ;
 487
 488 : failed-lexing ( assoc -- assoc' ) [ nip array? ] assoc-reject ;
 489
 490 : lex-core ( -- assoc ) core-vocabs lex-vocabs ;
 491 : lex-basis ( -- assoc ) basis-vocabs lex-vocabs ;
 492 : lex-extra ( -- assoc ) extra-vocabs lex-vocabs ;
 493 : lex-roots ( -- assoc ) lex-core lex-basis lex-extra 3append ;
 494
 495 : lex-docs ( -- assoc ) all-docs-paths lex-paths ;
 496 : lex-tests ( -- assoc ) all-tests-paths lex-paths ;
 497
 498 : lex-all ( -- assoc )
 499     lex-roots lex-docs lex-tests 3append ;