Commit 9f65ef33bd96add8fb512fa3153bc1d76e8ff8f4
1 parent
159a6521
Revert "Definitive (I hope so) fast_lexer_5 with its documentation updated."
This reverts commit eace497398506d94a2b8b8725b6d1004f7275b18.
Showing
8 changed files
with
233 additions
and
376 deletions
Show diff stats
anubis_dev/compiler/src/predef.anubis
| ... | ... | @@ -720,9 +720,6 @@ public define One truncate(ByteArray s, |
| 720 | 720 | public define ByteArray extract(ByteArray s, |
| 721 | 721 | Int start, |
| 722 | 722 | Int end) = +++avm{ extract_byte_array }. |
| 723 | -public define One write(ByteArray src, | |
| 724 | - ByteArray dest, | |
| 725 | - Int where) = +++avm{ write_byte_array }. | |
| 726 | 723 | public define ByteArray ByteArray s + ByteArray t = +++avm{ concat_byte_array }. |
| 727 | 724 | public define String to_ascii(ByteArray s) = +++avm{ byte_array_to_ascii }. |
| 728 | 725 | public define String to_string(ByteArray s) = +++avm{ byte_array_to_string }. | ... | ... |
anubis_dev/include/bytecode.h
anubis_dev/include/minver.h
anubis_dev/library/data_base/read_csv.anubis
| ... | ... | @@ -9,14 +9,14 @@ read tools/time.anubis |
| 9 | 9 | read lexical_analysis/fast_lexer_4.anubis |
| 10 | 10 | |
| 11 | 11 | |
| 12 | + The function made by the function below reads a single record from a CSV input source. | |
| 13 | + | |
| 12 | 14 | public type ReadCsvResult: |
| 13 | 15 | end_of_input, |
| 14 | 16 | error (String message), // an error message |
| 15 | 17 | ok (Int offset, List(String) record). // a single record and the offset of the end of |
| 16 | 18 | // this record. |
| 17 | 19 | |
| 18 | - The function constructed by the function below reads a single record from a CSV input source. | |
| 19 | - | |
| 20 | 20 | public define One -> ReadCsvResult |
| 21 | 21 | make_read_csv_line |
| 22 | 22 | ( | ... | ... |
anubis_dev/library/lexical_analysis/fast_lexer_4.anubis
| ... | ... | @@ -12,8 +12,7 @@ |
| 12 | 12 | |
| 13 | 13 | This is the reason why this file, which replaces the previous fast_lexer_4 is in fact |
| 14 | 14 | just a compatibility file between fast_lexer_4 and fast_lexer_5. The old version |
| 15 | - of fast_lexer_4 is kept into fast_lexer_4.anubis.old, but you cannot use it because | |
| 16 | - it is incompatible with the stuff in predefined.anubis. | |
| 15 | + of fast_lexer_4 is kept into fast_lexer_4.anubis.old. | |
| 17 | 16 | |
| 18 | 17 | However, in order to compile a file using fast_lexer_4.anubis, you have to replace: |
| 19 | 18 | |
| ... | ... | @@ -69,25 +68,6 @@ public define LexerOutput($Token) |
| 69 | 68 | } |
| 70 | 69 | }. |
| 71 | 70 | |
| 72 | -public define LexerOutput($Token) | |
| 73 | - convert | |
| 74 | - ( | |
| 75 | - LexerResult(ExToken($Token),$Aux) r, | |
| 76 | - ) = | |
| 77 | - if r is | |
| 78 | - { | |
| 79 | - ignore then should_not_happen(end_of_input), // because fast_lexer_4 has no 'ignore with action' | |
| 80 | - error(e) then | |
| 81 | - if e is lex_error(b,tools,aux) then | |
| 82 | - error(b,line(tools)(unique),column(tools)(unique)), | |
| 83 | - | |
| 84 | - token(tok) then if tok is | |
| 85 | - { | |
| 86 | - end_of_input then end_of_input, // both 'end_of_input' are not of the same type | |
| 87 | - token(t) then token(t) // idem | |
| 88 | - } | |
| 89 | - }. | |
| 90 | - | |
| 91 | 71 | Conversion in the other direction: |
| 92 | 72 | |
| 93 | 73 | public define Result(LexicalError($Aux),ExToken($Token)) | ... | ... |
anubis_dev/library/lexical_analysis/fast_lexer_5.anubis
| ... | ... | @@ -22,16 +22,13 @@ |
| 22 | 22 | |
| 23 | 23 | $authorline(Author:) (Alain Prouté (2016 from previous versions 2008, 2013 and 2014).) |
| 24 | 24 | $authorline(Contributions by:) (Matthieu Herrmann (2014, 2016).) |
| 25 | - $authorline(Last revision:) (Dec. 2016) | |
| 25 | + $authorline(Last revision:) (nov. 2016) | |
| 26 | 26 | $par |
| 27 | 27 | |
| 28 | 28 | |
| 29 | 29 | This is the fifth version of this tool. The main novelties compared to previous versions are: |
| 30 | 30 | $list( |
| 31 | 31 | $item Automatic generation of a $em(multistate lexer) (gathering several $em(sublexers)). |
| 32 | - $item Possibility to switch the state of the lexer from within itself. | |
| 33 | - $item Possibility to 'ignore with action'. (These two points are useful for example for ignoring nested | |
| 34 | - left and right delimited comments.) | |
| 35 | 32 | $item The type schema $att(LexerOutput($Token)) is no more used. It is replaced by a type which makes the |
| 36 | 33 | lexer directly usable as an argument to your APG parser (if any). |
| 37 | 34 | $item This documentation written in MAML and included into the source file. |
| ... | ... | @@ -76,19 +73,7 @@ |
| 76 | 73 | The lexers can be constructed by this program in two different ways, either statically |
| 77 | 74 | (i.e. at compile time) or dynamically (i.e. at run time). Use the second possibility only |
| 78 | 75 | if the $em(regular expressions) (see the definition below) needed for constructing the lexer |
| 79 | - are not known at compile time.$p | |
| 80 | - | |
| 81 | - See also APG (the $em(Anubis Parser Generator)) in$par | |
| 82 | - $center($fname(library/syntactic_analysis/parser_generator.anubis)) | |
| 83 | - which is the companion tool to this one.$p | |
| 84 | - | |
| 85 | - Note: If you want to scan utf-8 texts you just have to design your regular expressions | |
| 86 | - accordingly. Indeed, the lexer considers all bytes from 0 to 255. Notice | |
| 87 | - that the lexer computes column numbers in utf-8 mode, i.e. each multibyte utf-8 character counts | |
| 88 | - for just one column. | |
| 89 | - | |
| 90 | - | |
| 91 | - | |
| 76 | + are not known at compile time. | |
| 92 | 77 | |
| 93 | 78 | $section(Regular expressions) |
| 94 | 79 | |
| ... | ... | @@ -291,12 +276,7 @@ public type LexingTools: |
| 291 | 276 | is to prepare the token after it is just recognized, i.e. to transform the token, which comes as a byte array, |
| 292 | 277 | into a datum of type $att($Token).$p |
| 293 | 278 | |
| 294 | - You can also get the tools associated to a lexing stream by using the following: | |
| 295 | - $ecode(public define LexingTools tools(LexingStream ls).) | |
| 296 | - This is useful for example if you want to transmit the lexing tools to an APG parser (through the | |
| 297 | - $att(extra datum); see APG documentation).$p | |
| 298 | - | |
| 299 | - The first three tools provide the line and column number of the current token together with its offset in | |
| 279 | + The first three tools provide the line and column number of the token together with its offset in | |
| 300 | 280 | the input stream.$p |
| 301 | 281 | |
| 302 | 282 | The last tool enables to go several bytes backward in the input. It is not guaranteed that the lexer can go
| ... | ... | @@ -309,7 +289,7 @@ public type LexingTools: |
| 309 | 289 | |
| 310 | 290 | |
| 311 | 291 | $section(Describing a single state lexer) |
| 312 | - As we shall see below, a lexer can have so-called $em(states). These are of course not the states of the constructed | |
| 292 | + As we shall see below, a lexer can have so called $em(states). These are of course not the states of the constructed | |
| 313 | 293 | automaton, but a notion similar to the LEX/FLEX notion of state. For the time being we consider lexer with only one |
| 314 | 294 | (default) state. |
| 315 | 295 | |
| ... | ... | @@ -338,7 +318,7 @@ public type LexingTools: |
| 338 | 318 | Hence, in most cases, it is preferable to write an APG file first, and |
| 339 | 319 | to define your lexer later on. $p |
| 340 | 320 | |
| 341 | - The type of tokens for a given lexer is represented in this documentation by the type parameter | |
| 321 | + The type of tokens for a given lexer is represented in this file by the type parameter | |
| 342 | 322 | $att($ Token). |
| 343 | 323 | |
| 344 | 324 | |
| ... | ... | @@ -355,12 +335,12 @@ public type LexicalError($Aux): |
| 355 | 335 | ) |
| 356 | 336 | The component $att(b) is the text which produced the error. From this and the components $att(t) and $att(a), you can |
| 357 | 337 | produce well documented error messages, containing for example a line number, a column number. You can also use the |
| 358 | - tool $att(back(t)) (if needed, and with care).$p | |
| 338 | + tool $att(t(back)) in order to come back to the beginning (for example) of the faulty text, change the state of the | |
| 339 | + lexer, and reread the token with an other sublexer.$p | |
| 359 | 340 | |
| 360 | - A lexer returns a datum of type:$par | |
| 341 | + A lexer returns a datum of type: | |
| 361 | 342 | $center($att(Result(LexicalError($Aux),$Token))) |
| 362 | - instead of $att(LexerOutput($Token)) which was the case | |
| 363 | - in $fname(fast_lexer_4). The advantage is that the constructed lexer is | |
| 343 | + instead of $att(LexerOutput($Token)) in $fname(fast_lexer_4). The advantage is that the constructed lexer is | |
| 364 | 344 | directly usable as an argument to parsers produced by APG.$p |
| 365 | 345 | |
| 366 | 346 | Actually, the main structural difference with $att(LexerOutput($Token)) which had a alternative named $att(end_of_input) is |
| ... | ... | @@ -370,6 +350,11 @@ public type LexicalError($Aux): |
| 370 | 350 | we shall see below, this function takes this token as an argument. If you don't use APG, you must create a type of |
| 371 | 351 | tokens with an alternative for representing the end of input.$p |
| 372 | 352 | |
| 353 | + Note: If you want to scan utf-8 texts you just have to design your regular expressions | |
| 354 | + accordingly. Indeed, the lexer considers all bytes from 0 to 255. Notice | |
| 355 | + that the lexer computes column numbers in utf-8 mode, i.e. each multibyte utf-8 character counts | |
| 356 | + for just one column. | |
| 357 | + | |
| 373 | 358 | |
| 374 | 359 | |
| 375 | 360 | $subsection(Actions) |
| ... | ... | @@ -377,9 +362,6 @@ public type LexicalError($Aux): |
| 377 | 362 | $acode( |
| 378 | 363 | public type LexerAction($Token,$Aux): |
| 379 | 364 | ignore, |
| 380 | - ignore((ByteArray token, | |
| 381 | - LexingTools tools, | |
| 382 | - $Aux aux) -> One), | |
| 383 | 365 | return((ByteArray token, |
| 384 | 366 | LexingTools tools, |
| 385 | 367 | $Aux aux) -> Result(LexicalError($Aux),$Token)), |
| ... | ... | @@ -391,37 +373,22 @@ public type LexerAction($Token,$Aux): |
| 391 | 373 | The first alternative $att(ignore) means that no action should be performed so that the recognized token is just |
| 392 | 374 | $em(ignored) (and the low level lexer keeps on reading without returning).$p |
| 393 | 375 | |
| 394 | - The second alternative also ignores the token but allows to perform an action. | |
| 395 | - The action is a function receiving the | |
| 376 | + The second alternative is used when a token is recognized and | |
| 377 | + an actual action must be performed. The action is a function receiving the | |
| 396 | 378 | following arguments: |
| 397 | 379 | $list( |
| 398 | 380 | $item the token in the form of a byte array, |
| 399 | 381 | $item the toolbox containing the lexing tools, |
| 400 | 382 | $item auxiliary data whose type is to be defined by yourself. |
| 401 | 383 | ) |
| 402 | - and it returns a datum | |
| 403 | - of type $att(One) because the lexer will not return anything. Instead, it will resume reading from | |
| 404 | - the input stream.$p | |
| 405 | - | |
| 406 | - $bold(Important note): If a token is to be ignored and no action is needed, use the first alternative, | |
| 407 | - not the second one (with a dummy action). Not doing so could dramatically degrade performances. This is | |
| 408 | - due to the fact that the first alternative doesn't make the low level lexer (defined in | |
| 409 | - $fname(predefined.anubis)) return.$p | |
| 410 | - | |
| 411 | - The third alternative is used when a token is recognized and | |
| 412 | - an actual action must be performed. The action is a function receiving the | |
| 413 | - same arguments as above. | |
| 414 | 384 | It must return the token as a datum of type $att(Result(LexicalError($Aux),$Token)). The reason why it doesn't just return |
| 415 | - a datum of type $att($Token) is that the function itself must be able to reject (for some reason of | |
| 416 | - your own) the recognized | |
| 385 | + a datum of type $att($Token) is that the function itself must be able to reject (for some reason) the recognized | |
| 417 | 386 | token, and to return an error (see example below).$p |
| 418 | 387 | |
| 419 | - The fourth alternative in $att(LexerAction($Token,$Aux)) is a variant of the third one. Instead of extracting | |
| 388 | + The third alternative in $att(LexerAction($Token,$Aux)) is a variant of the second one. Instead of extracting | |
| 420 | 389 | the token from the buffer, the function provides tools for extracting a part of the token. The argument |
| 421 | - $att(length) is the total length of the token. The function $att(extract) enables | |
| 422 | - to extract the part of the token | |
| 423 | - located between positions $att(s) (included) and $att(e) (not included), relative to the beginning of | |
| 424 | - the token. For example, | |
| 390 | + $att(length) is the total length of the token. The function $att(extract) enables to extract the part of the token | |
| 391 | + located between positions $att(s) (included) and $att(e) (not included), relative to the token. For example, | |
| 425 | 392 | $att(extract(0,length)) gives the whole token. |
| 426 | 393 | |
| 427 | 394 | |
| ... | ... | @@ -495,6 +462,17 @@ public type LexerItem($Token,$Aux): |
| 495 | 462 | for the users of your program. |
| 496 | 463 | |
| 497 | 464 | |
| 465 | + $subsection(Ignoring a token) | |
| 466 | + If you don't provide a function in a lexer item (using $att(ignore) instead of $att(return)), | |
| 467 | + the recognized token is just ignored and the lexer tries to read the next token. For example, | |
| 468 | + this may be used for ignoring white spaces. $p | |
| 469 | + | |
| 470 | + If you want to ignore a token and nevertheless execute an action, use the $att(return) | |
| 471 | + alternative and discard the token instead of returning it to the parser. Don't use this | |
| 472 | + possibility when $att(ignore) can be used (i.e. if no action is required), because performances | |
| 473 | + could be dramatically degraded. Indeed, the Anubis virtual machine instruction implementing the low | |
| 474 | + level fast lexer doesn't return on an $att(ignore) and keeps on reading. | |
| 475 | + | |
| 498 | 476 | |
| 499 | 477 | $subsection(Putting lexer items in the right order) |
| 500 | 478 | The order of the lexer items in a lexer description can be important. The lexer can |
| ... | ... | @@ -555,13 +533,114 @@ public define Result(RegExprError, |
| 555 | 533 | |
| 556 | 534 | |
| 557 | 535 | |
| 536 | + | |
| 537 | + $section(Multistate lexers) | |
| 538 | + The LEX/FLEX software has a notion of $em(state) for lexers. This means that regular expressions are partitioned into | |
| 539 | + several (disjoint) sets, one for each $em(state), and that the lexer, when called in a given state, uses only those | |
| 540 | + regular expressions which belong to the set corresponding to this state.$p | |
| 541 | + | |
| 542 | + It is equivalent to say that the lexer is made of several $em(sublexers), one for each state, and that the $em(global | |
| 543 | + lexer) uses only one sublexer depending on its state. In this section, we describe how to create a $em(multistate) | |
| 544 | + lexer.$p | |
| 545 | + | |
| 546 | + $subsection(Describing a multistate lexer) | |
| 547 | + In order to create a multistate lexer, you first describe $em(several) sublexers, one for each state. Each sublexer is of | |
| 548 | + type $att(List(LexerItem($Token,$Aux))), in other words, it is described exactly in the same way as a single state | |
| 549 | + lexer. | |
| 550 | + The values of the type parameters $att($Token) and $att($Aux) must be | |
| 551 | + the same one for all sublexers. In other words, there is a unique token type for your multistate lexer, and also a | |
| 552 | + unique auxiliary data type.$p | |
| 553 | + | |
| 554 | + In order to associate a name to a sublexer description, we need the following type: | |
| 555 | + $acode( | |
| 556 | +public type SubLexer($Token,$Aux): | |
| 557 | + sublexer (String sublexer_name, | |
| 558 | + List(LexerItem($Token,$Aux)) description). | |
| 559 | + ) | |
| 560 | + where $att(sublexer_name) is the name you want to give to the corresponding sublexer. This name must be valid as an | |
| 561 | + Anubis symbol, because it is used as such in the generated file in case you want to precompile your lexer.$p | |
| 562 | + | |
| 563 | + $define(LexerState)(0)($ LexerState) | |
| 564 | + | |
| 565 | + Actually, what this program will do is to set (within a generated file) the definition of a | |
| 566 | + $em(type of states) for the lexer, whose name is constructed by prefixing $att(LexerState_) in front of | |
| 567 | + the name of your multistate lexer. This type has one alternative (without component) for each sublexer. | |
| 568 | + It is represented below by the type parameter $att($LexerState).$p | |
| 569 | + | |
| 570 | + In order to avoid a circularity problem with APG, the definition of the type $att(LexerState_<lexer name>) is placed | |
| 571 | + into a separate file whose name is $fname(<lexer name>_states.anubis). The rest of the generated lexer is placed into | |
| 572 | + the file $fname(<lexer name>.anubis). Indeed, APG generates the type of tokens which is to be used from within the | |
| 573 | + lexer, and fast_lexer_5 generates the type of states of the lexer which is to be used from within the APG grammar.$p | |
| 574 | + | |
| 575 | + Thanks to the above separation of the generated lexer into two generated files, the parser can $att(read) (actually | |
| 576 | + $att(transmit)) the file $fname(<lexer name>_states.anubis) and the lexer can $att(read) the declaration file | |
| 577 | + generated by APG without creating a circularity problem.$p | |
| 578 | + | |
| 579 | + If you are using APG, remember that you should produce all generated files (those generated by APG and those | |
| 580 | + generated by fast_lexer_5) before reading any of them. | |
| 581 | + | |
| 582 | + | |
| 583 | + $subsection(Creating a multistate lexer) | |
| 584 | + This done, you can construct your multistate lexer (at run time) as follows: | |
| 585 | + $acode( | |
| 586 | +public define Result(RegExprError, | |
| 587 | + ((LexingStream,$Aux) -> One -> Result(LexicalError($Aux),$Token), | |
| 588 | + $LexerState -> One)) | |
| 589 | + make_lexer | |
| 590 | + ( | |
| 591 | + String lexer_name, | |
| 592 | + String name_of_initial_sublexer, | |
| 593 | + List(SubLexer($Token,$Aux)) sublexers_descriptions, | |
| 594 | + Word8 escape_char // '#' recommended here | |
| 595 | + ). | |
| 596 | + ) | |
| 597 | + The differences with the previous $att(make_lexer) are: | |
| 598 | + $list( | |
| 599 | + $item after plugging the lexer onto a lexing stream, you get two functions instead of one. The first one is the | |
| 600 | + lexer itself of type $att(One -> Result(LexicalError($Aux),$Token)), and the second one of type $att($LexerState -> One) is the | |
| 601 | + command for changing the state of the lexer, | |
| 602 | + | |
| 603 | + $item $att(make_lexer) needs to know the names of the lexer and of the initial state (i.e. of the sublexer to be used | |
| 604 | + in the first place), | |
| 605 | + | |
| 606 | + $item you must give a list of sublexers descriptions instead of a single lexer description. | |
| 607 | + ) | |
| 608 | + | |
| 609 | + | |
| 610 | + $subsection(Using a multistate lexer) | |
| 611 | + A multistate lexer is used exactly in the same way as a single state lexer. You just have the extra possibility to | |
| 612 | + change its state at will.$p | |
| 613 | + | |
| 614 | + If you are not using APG, you can change the state of the lexer between two calls.$p | |
| 615 | + | |
| 616 | + If you are using APG, the best is to change the state of the lexer from within the grammar rules. This can be done | |
| 617 | + by way of $em(immediate commands). In order to make the change of state function available in grammar rules, | |
| 618 | + you must transmit it within the so-called $em(extra datum) accepted by APG. See APG's documentation. | |
| 619 | + | |
| 620 | + | |
| 621 | + | |
| 622 | + | |
| 623 | + | |
| 624 | + $section(Computing a lexer at compile time) | |
| 625 | + If you do as explained above, your lexer is constructed at run time. If the | |
| 626 | + lexer description is already known at compile time, it is preferable to construct | |
| 627 | + the lexer at compile time. | |
| 628 | + | |
| 629 | + | |
| 558 | 630 | $subsection(Precompiling a single state lexer) |
| 559 | - If the description of your lexer is known at compile time, it is preferable to | |
| 560 | - $em(precompile) your lexer. This is done by the function $att(make_precompiled_lexer) | |
| 561 | - defined below. It creates an Anubis source file (that we call below the $em(generated file)) | |
| 562 | - containing your already compiled lexer, | |
| 563 | - but not containing the actions. We explain below how you can get your working | |
| 564 | - lexer from this file and the original lexer description.$p | |
| 631 | + In order to do that, write the following into your | |
| 632 | + source file: $label(precompilemylexer) | |
| 633 | + $ecode( global define One | |
| 634 | + precompile_my_lexer // of course, you can choose another name here | |
| 635 | + ( | |
| 636 | + List(String) _ // not used | |
| 637 | + ) = | |
| 638 | + make_precompiled_lexer(lexer_name,lexer_description,'#',end_of_input). | |
| 639 | + | |
| 640 | + execute anbexec precompile_my_lexer) | |
| 641 | + This creates an Anubis source file whose name is $fname(lexer_name.anubis) | |
| 642 | + within a subdirectory (of the current directory) named $fname(generated), which is created | |
| 643 | + if needed. This execution prints error messages (if any) on the standard output.$p | |
| 565 | 644 | |
| 566 | 645 | The function $att(make_precompiled_lexer) is declared as follows: |
| 567 | 646 | $acode( |
| ... | ... | @@ -572,35 +651,11 @@ public define One |
| 572 | 651 | List(LexerItem($Token,$Aux)) lexer_description, |
| 573 | 652 | Word8 escape_char, |
| 574 | 653 | String end_of_input |
| 575 | - ). ) | |
| 576 | - The argument $att(lexer_name) is the name of the lexer. It will become the name of the generated | |
| 577 | - file (with $fname(.anubis) appended). For example, if the first argument is $att("my_lexer"), the | |
| 578 | - name of the file will be $fname(my_lexer.anubis).$p | |
| 579 | - | |
| 580 | - The argument $att(end_of_input) is necessary because the | |
| 581 | - lexer needs to know which token is to be returned when the end of the input is encountered. | |
| 582 | - Notice that here, the end of input is not given as a token but as a character string. | |
| 583 | - This is because this string will | |
| 584 | - be printed into the generated file in order to make the actual token.$p | |
| 654 | + ). | |
| 655 | + ) | |
| 656 | + Notice that here the $att(end_of_input) is not given as a token but as a character string. This is because this string will | |
| 657 | + be printed into the generated file in order to make the actual token.$p | |
| 585 | 658 | |
| 586 | - | |
| 587 | - | |
| 588 | - The generated file is created by default in the subdirectory $fname(generated) of the current | |
| 589 | - directory (this subdirectory is automatically created if needed). | |
| 590 | - In order to produce this file, write the following into your | |
| 591 | - source file: $label(precompilemylexer) | |
| 592 | - $ecode(global define One | |
| 593 | - precompile_my_lexer // of course, you can choose another name here | |
| 594 | - ( | |
| 595 | - List(String) _ // not used | |
| 596 | - ) = | |
| 597 | - make_precompiled_lexer("my_lexer", | |
| 598 | - my_lexer_description, | |
| 599 | - '#', | |
| 600 | - "end_of_input").) | |
| 601 | - $ecode(execute anbexec precompile_my_lexer) | |
| 602 | - This execution prints error messages (if any) on the standard output.$p | |
| 603 | - | |
| 604 | 659 | If you want to create the target files in another directory than $fname(generated), use |
| 605 | 660 | the variant below: |
| 606 | 661 | $acode( |
| ... | ... | @@ -619,20 +674,20 @@ public define One |
| 619 | 674 | don't even need to have a look at it. |
| 620 | 675 | |
| 621 | 676 | $subsection(Plugging a precompiled single state lexer onto a lexing stream) |
| 622 | - In order to get your precompiled lexer in the form of | |
| 677 | + In order to get your lexer in the form of | |
| 623 | 678 | a function of type:$par |
| 624 | 679 | |
| 625 | 680 | $center($att((LexingStream,$Aux) -> One -> Result(LexicalError($Aux),$Token))) |
| 626 | 681 | |
| 627 | - as above, just write this (still assuming that the name of your lexer is $att(my_lexer)): | |
| 628 | - $ecode(read generated/my_lexer.anubis) | |
| 629 | - This defines a datum whose name is $att(my_lexer). This datum contains the compiled automaton itself, | |
| 682 | + as above, just write this: | |
| 683 | + $ecode(read generated/lexer_name.anubis) | |
| 684 | + This defines a datum whose name is $att(lexer_name). This datum contains the compiled automaton itself, | |
| 630 | 685 | but not the actions.$p |
| 631 | 686 | |
| 632 | - At the place in your program where you want to have your lexer, write this: | |
| 633 | - $ecode(retrieve_lexer(my_lexer_description,my_lexer)) | |
| 634 | - Actually, $att(my_lexer_description) is used here only for retrieving the actions. The automaton | |
| 635 | - is contained into $att(my_lexer).$p | |
| 687 | + At the place in your program where you want to have your lexer: | |
| 688 | + $ecode(retrieve_lexer(lexer_description,lexer_name)) | |
| 689 | + Actually, $att(lexer_description) is used here only for retrieving the actions. The automaton | |
| 690 | + is contained into $att(lexer_name).$p | |
| 636 | 691 | |
| 637 | 692 | The function $att(retrieve_lexer) is declared as follows: |
| 638 | 693 | $acode( |
| ... | ... | @@ -643,61 +698,22 @@ public define One |
| 643 | 698 | PrecompiledLexer automaton // the datum in the generated file |
| 644 | 699 | ). |
| 645 | 700 | ) |
| 646 | - At this moment no error can happen if the file $fname(my_lexer.anubis) did not contain | |
| 701 | + At this moment no error can happen if the file $fname(lexer_name.anubis) did not contain | |
| 647 | 702 | any error (this automatically generated file should not contain any error, anyway). |
| 648 | 703 | |
| 649 | 704 | |
| 650 | 705 | |
| 651 | 706 | |
| 652 | - | |
| 653 | - | |
| 654 | - | |
| 655 | - | |
| 656 | - | |
| 657 | - | |
| 658 | - | |
| 659 | - $section(Multistate lexers) | |
| 660 | - The LEX/FLEX software has a notion of $em(state) for lexers. This means that regular expressions are partitioned into | |
| 661 | - several (disjoint) sets, one for each $em(state), and that the lexer, when called in a given state, uses only those | |
| 662 | - regular expressions which belong to the set corresponding to this state.$p | |
| 663 | - | |
| 664 | - It is equivalent to say that the lexer is made of several $em(sublexers), one for each state, and that the $em(global | |
| 665 | - lexer) uses only one sublexer depending on its state. In this section, we describe how to create a $em(multistate) | |
| 666 | - lexer.$p | |
| 667 | - | |
| 668 | - Multistate lexers are always precompiled. Hence, their description must be known at compile time. | |
| 669 | - | |
| 670 | - | |
| 671 | - $subsection(Describing a multistate lexer) | |
| 672 | - In order to create a multistate lexer, you first describe $em(several) sublexers, one for each state. Each sublexer is of | |
| 673 | - type $att(List(LexerItem($Token,$Aux))), in other words, it is described exactly in the same way as a single state | |
| 674 | - lexer. | |
| 675 | - The values of the type parameters $att($Token) and $att($Aux) must be | |
| 676 | - the same one for all sublexers. In other words, there is a unique token type for your multistate lexer, and also a | |
| 677 | - unique auxiliary data type.$p | |
| 678 | - | |
| 679 | - In order to associate a name to a sublexer description, we need the following type: | |
| 680 | - $acode( | |
| 681 | -public type SubLexer($Token,$Aux): | |
| 682 | - sublexer (String sublexer_name, | |
| 683 | - List(LexerItem($Token,$Aux)) description). | |
| 684 | - ) | |
| 685 | - where $att(sublexer_name) is the name you want to give to the corresponding sublexer. This name must be valid as an | |
| 686 | - Anubis symbol, because it is used as such in the generated file in case you want to precompile your lexer.$p | |
| 687 | - | |
| 688 | - $define(LexerState)(0)($ LexerState) | |
| 689 | - | |
| 690 | - | |
| 691 | 707 | |
| 692 | 708 | $subsection(Precompiling a multistate lexer) |
| 693 | - Precompiling a multistate lexer is similar to precompiling a single state lexer. | |
| 694 | - To precompile a multistate lexer, use the following: | |
| 709 | + The above $att(make_lexer) function creates a multistate lexer at run time. If you want to compile a multistate lexer | |
| 710 | + at compile time, use the following: | |
| 695 | 711 | $acode( |
| 696 | 712 | public define One |
| 697 | 713 | make_precompiled_lexer |
| 698 | 714 | ( |
| 699 | 715 | String lexer_name, |
| 700 | - List(String) reads, | |
| 716 | + String name_of_initial_sublexer, | |
| 701 | 717 | List(SubLexer($Token,$Aux)) sublexers_descriptions, |
| 702 | 718 | Word8 escape_char, |
| 703 | 719 | String end_of_input |
| ... | ... | @@ -711,69 +727,47 @@ public define One |
| 711 | 727 | ( |
| 712 | 728 | String directory, |
| 713 | 729 | String lexer_name, |
| 714 | - List(String) reads, | |
| 730 | + String name_of_initial_sublexer, | |
| 715 | 731 | List(SubLexer($Token,$Aux)) sublexers_descriptions, |
| 716 | 732 | Word8 escape_char, |
| 717 | 733 | String end_of_input |
| 718 | 734 | ). |
| 719 | 735 | ) |
| 720 | - The argument $att(reads) let you give a list of Anubis source file names that will be $att(read) at | |
| 721 | - the beginning of the generated file. You need to put at least one such file name, because the generated | |
| 722 | - program looks for a type whose name is $att(LexerState_my_lexer) (again assuming that $att(my_lexer) | |
| 723 | - is the name of your lexer). This is the type of the states of the | |
| 724 | - lexer. This type must be a enumerated type with one alternative for each sublexer (in the same order), | |
| 725 | - and the name of this | |
| 726 | - alternative must be the name of the corresponding sublexer.$p | |
| 727 | - | |
| 728 | - The reason why this type is not automatically generated at the beginning of the generated file is to avoid | |
| 729 | - a possible circularity problem in case you also use APG. Indeed, APG defines the type of tokens, so that we | |
| 730 | - will have to put $att(my_parser.apg.dec.anubis) (assuming that the name of your parser is $att(my_parser)) | |
| 731 | - into the list $att(reads). But your parser will also need | |
| 732 | - to know the type of states of the lexer, since it must be able to switch between these states.$p | |
| 733 | - | |
| 734 | - As a consequence, this type should be defined (by hand) in another file. If you are using APG, you should | |
| 735 | - define it (as a $att(public type)) in the public preambule of your APG file, so that it appears at the | |
| 736 | - beginning of | |
| 737 | - $fname(my_parser.apg.dec.anubis).$p | |
| 738 | - | |
| 739 | - In order to actually precompile your lexer, you must execute the above function. You can do this in exactly | |
| 740 | - the same | |
| 741 | - way as for a single state lexer (see the example $ref(precompilemylexer)($att(precompile_my_lexer)) above).$p | |
| 742 | - | |
| 743 | - | |
| 736 | + In order to actually precompile your lexer, you must execute the above function. You can do this in exactly the same | |
| 737 | + way as for a single state lexer (see $ref(precompilemylexer)($att(precompile_my_lexer)) above). | |
| 744 | 738 | |
| 745 | 739 | $subsection(Plugging a precompiled multistate lexer onto a lexing stream) |
| 746 | - The file generated by the above function $att(make_precompiled_lexer) has a very short public part which | |
| 747 | - contains the declaration of a public function for creating an instance of the lexer plugged onto a given | |
| 748 | - lexing stream.$p | |
| 740 | + The file generated by the above function $att(make_precompiled_lexer) has a very short public part which contains: | |
| 741 | + $list( | |
| 742 | + $item the definition of the public type of lexer states, | |
| 743 | + $item a public function for creating an instance of the lexer plugged onto a given lexing stream. | |
| 744 | + ) | |
| 745 | + The name of the type of lexer states is $att(LexerState_<lexer name>), where $att(<lexer name>) is the | |
| 746 | + actual name of your lexer (i.e. the value of the $att(lexer_name) argument in the above function). Assuming that this name is | |
| 747 | + $att(my_lexer), the type of lexer states has name $att(LexerState_my_lexer). This is an enumarated type with one | |
| 748 | + alternative for each sublexer. The name of such an alternative is the name of the corresponding sublexer, but of | |
| 749 | + course in the form of an Anubis symbol instead of a character string.$p | |
| 749 | 750 | |
| 750 | - This public function has name $att(plug_my_lexer) (again assuming that the name of the lexer is $att(my_lexer)). This | |
| 751 | + The public function has name $att(plug_my_lexer) (again assuming that the name of the lexer is $att(my_lexer)). This | |
| 751 | 752 | function is declared as follows: |
| 752 | - $ecode(public define Maybe(One -> Result(LexicalError($Aux),$Token)) | |
| 753 | + $ecode(public define Maybe((One -> Result(LexicalError($Aux),$Token), // The actual lexer | |
| 754 | + LexerState_my_lexer -> One)) // The change of state command | |
| 753 | 755 | plug_my_lexer |
| 754 | 756 | ( |
| 755 | 757 | LexingStream ls, |
| 756 | - Var(LexerState_my_lexer) lexer_state_v, | |
| 757 | 758 | $Aux aux, |
| 758 | - List(SubLexer($Token,$Aux)) lexer_description, | |
| 759 | - $Token end_of_input | |
| 759 | + List(SubLexer($Token,$Aux)) lexer_description | |
| 760 | 760 | ).) |
| 761 | 761 | This function returns $att(failure) if the given $att(lexer_description) doesn't correspond to the precompiled lexer. |
| 762 | - Otherwise, it returns a function of type$par | |
| 763 | - $center($att(One -> Result(LexicalError($Aux),$Token))) | |
| 764 | - which is the lexer itself already plugged | |
| 765 | - onto the lexing | |
| 766 | - stream.$p | |
| 767 | - | |
| 768 | - Remark that you must provide a dynamic variable whose role is to hold the current state of the lexer. You can | |
| 769 | - transmit this variable (or a function able to assign a value to it) to your parser. If you are using APG, you | |
| 770 | - can transmit it through the $em(extra datum), so that it is available from within $em(immediate commands). | |
| 771 | - The content of this dynamic variable will of course be the initial | |
| 772 | - state of the multistate lexer. | |
| 762 | + Otherwise, it returns a pair of functions.$p | |
| 773 | 763 | |
| 764 | + The first function, of type $att(One -> Result(LexicalError($Aux),$Token)), is the lexer itself already plugged onto the lexing | |
| 765 | + stream. It returns the next token (or $att(end_of_input) or an error) at each call.$p | |
| 774 | 766 | |
| 767 | + The second one is the function you can use for changing the state of the lexer. It takes the wanted new state as its | |
| 768 | + argument. This function is normally used from within your parser, and if you are using APG, you should transmit it to | |
| 769 | + the parser through the $att(extra) datum, and use it within $em(immediate commands) (see APG documentation). | |
| 775 | 770 | |
| 776 | - | |
| 777 | 771 | $section(Useful tricks) |
| 778 | 772 | |
| 779 | 773 | $subsection(Testing if a whole string is a single token) |
| ... | ... | @@ -832,6 +826,9 @@ public define One |
| 832 | 826 | obtained at each reading of this variable. |
| 833 | 827 | |
| 834 | 828 | |
| 829 | + $subsection(Viewing the automaton) | |
| 830 | + | |
| 831 | + | |
| 835 | 832 | |
| 836 | 833 | |
| 837 | 834 | |
| ... | ... | @@ -865,7 +862,6 @@ read tools/2-4tree.anubis |
| 865 | 862 | |
| 866 | 863 | type LexerRankAction($Token,$Aux): |
| 867 | 864 | ignore(Int rk), |
| 868 | - ignore(Int rk, (ByteArray,LexingTools,$Aux) -> One), | |
| 869 | 865 | return(Int rk, (ByteArray,LexingTools,$Aux) -> Result(LexicalError($Aux),$Token)), |
| 870 | 866 | return(Int rk, ((Int,Int) -> ByteArray,Int,LexingTools,$Aux) -> Result(LexicalError($Aux),$Token)). |
| 871 | 867 | |
| ... | ... | @@ -882,7 +878,6 @@ define LexerRankAction($Token,$Aux) |
| 882 | 878 | if a is |
| 883 | 879 | { |
| 884 | 880 | ignore then ignore(rank), |
| 885 | - ignore(f) then ignore(rank,f), | |
| 886 | 881 | return(f) then return(rank,f), |
| 887 | 882 | return(f) then return(rank,f) |
| 888 | 883 | }. |
| ... | ... | @@ -2462,20 +2457,14 @@ public type DFA_state($Token,$Aux): |
| 2462 | 2457 | List(DFA_transition) transitions, |
| 2463 | 2458 | Int action_rank, |
| 2464 | 2459 | (ByteArray,LexingTools,$Aux) |
| 2465 | - -> Result(LexicalError($Aux),$Token) action), | |
| 2466 | - | |
| 2467 | - ignoring (Word16 name, | |
| 2468 | - List(DFA_transition) transitions, | |
| 2469 | - Int action_rank, | |
| 2470 | - (ByteArray,LexingTools,$Aux) | |
| 2471 | - -> One action), | |
| 2460 | + -> Result(LexicalError($Aux),$Token) action), | |
| 2472 | 2461 | |
| 2473 | 2462 | accepting (Word16 name, |
| 2474 | 2463 | List(DFA_transition) transitions, |
| 2475 | 2464 | Int action_rank, |
| 2476 | 2465 | ((Int,Int) |
| 2477 | 2466 | -> ByteArray,Int,LexingTools,$Aux) |
| 2478 | - -> Result(LexicalError($Aux),$Token) action), | |
| 2467 | + -> Result(LexicalError($Aux),$Token) action), | |
| 2479 | 2468 | |
| 2480 | 2469 | ignoring (Word16 name, |
| 2481 | 2470 | List(DFA_transition) transitions). |
| ... | ... | @@ -2586,8 +2575,6 @@ define List(DFA_state($Token,$Aux)) |
| 2586 | 2575 | { |
| 2587 | 2576 | ignore(rk) then // a state has a 'ignore' if and only if it is 'ignoring' |
| 2588 | 2577 | [ignoring(get_new_name(old_name,nlist),trs) . rename(t,nlist)], |
| 2589 | - ignore(rk,a) then | |
| 2590 | - [ignoring(get_new_name(old_name,nlist),trs,rk,a) . rename(t,nlist)], | |
| 2591 | 2578 | return(rk,a) then |
| 2592 | 2579 | [accepting(get_new_name(old_name,nlist),trs,rk,a) . rename(t,nlist)], |
| 2593 | 2580 | return(rk,a) then |
| ... | ... | @@ -2705,9 +2692,6 @@ public define List(FastLexerState) |
| 2705 | 2692 | { |
| 2706 | 2693 | rejecting(n,trs) then rejecting(to_fast_lexer_transitions(trs)) |
| 2707 | 2694 | accepting(n,trs,rk,a) then accepting(to_fast_lexer_transitions(trs)) |
| 2708 | - // ignoring with action translates to 'accepting' in the low level lexer, because | |
| 2709 | - // the low level lexer must return so that the action is performed. | |
| 2710 | - ignoring (n,trs,rk,a) then accepting(to_fast_lexer_transitions(trs)) | |
| 2711 | 2695 | accepting(n,trs,rk,a) then accepting(to_fast_lexer_transitions(trs)) |
| 2712 | 2696 | ignoring (n,trs) then ignoring(to_fast_lexer_transitions(trs)) |
| 2713 | 2697 | } . to_fast_lexer_description(t)] |
| ... | ... | @@ -2752,9 +2736,8 @@ define TreeKV(Word16,LexerAction($Token,$Aux)) |
| 2752 | 2736 | { |
| 2753 | 2737 | rejecting(name,trs) then tree, |
| 2754 | 2738 | accepting(name,trs,rank,action) then insert(name,return(action),tree), |
| 2755 | - ignoring (name,trs,rank,action) then insert(name,ignore(action),tree), | |
| 2756 | 2739 | accepting(name,trs,rank,action) then insert(name,return(action),tree), |
| 2757 | - ignoring (name,trs) then tree | |
| 2740 | + ignoring(name,trs) then tree | |
| 2758 | 2741 | }, |
| 2759 | 2742 | fill_actions(t,next_tree) |
| 2760 | 2743 | }. |
| ... | ... | @@ -2858,14 +2841,6 @@ public define TreeKV(Word16,LexerAction($Token,$Aux)) |
| 2858 | 2841 | |
| 2859 | 2842 | |
| 2860 | 2843 | *** [7.3] Reading the next token. |
| 2861 | - | |
| 2862 | - What the lexer returns: | |
| 2863 | - | |
| 2864 | -public type LexerResult($Token,$Aux): | |
| 2865 | - ignore, // something to be ignored (this happens because of ignore+action) | |
| 2866 | - error (LexicalError($Aux)), // a lexical error | |
| 2867 | - token ($Token). // a token to be returned | |
| 2868 | - | |
| 2869 | 2844 | |
| 2870 | 2845 | A special debugging macro for the function 'read_next_token'. |
| 2871 | 2846 | |
| ... | ... | @@ -2874,7 +2849,8 @@ define macro One debug_rnt(String s) = unique. |
| 2874 | 2849 | |
| 2875 | 2850 | The function which reads the next token: |
| 2876 | 2851 | |
| 2877 | -public define LexerResult($Token,$Aux) | |
| 2852 | +public define Result(LexicalError($Aux), // can return a lexical error | |
| 2853 | + $Token) // or a token (which can be 'end_of_input') | |
| 2878 | 2854 | read_next_token |
| 2879 | 2855 | ( |
| 2880 | 2856 | (ByteArray,FastLexerLastAccepted,Int,Int,Word16) -> FastLexerOutput low_level_lexer, |
| ... | ... | @@ -3009,7 +2985,7 @@ public define LexerResult($Token,$Aux) |
| 3009 | 2985 | current_v <- end; |
| 3010 | 2986 | last_accept_v <- none; |
| 3011 | 2987 | if start >= end |
| 3012 | - then debug_rnt("case (2a1)"); token(end_of_input) | |
| 2988 | + then debug_rnt("case (2a1)"); ok(end_of_input) | |
| 3013 | 2989 | else debug_rnt("case (2a2)"); error(lex_error(extract(*buffer_v,start,end),tools,aux)), |
| 3014 | 2990 | |
| 3015 | 2991 | success(_) then /*** Case (2b) ***/ |
| ... | ... | @@ -3044,33 +3020,21 @@ public define LexerResult($Token,$Aux) |
| 3044 | 3020 | last_accept_v <- none; |
| 3045 | 3021 | if get(s,actions) is |
| 3046 | 3022 | { |
| 3047 | - failure then should_not_happen(token(end_of_input)), | |
| 3023 | + failure then should_not_happen(ok(end_of_input)), | |
| 3048 | 3024 | success(ac) then if ac is |
| 3049 | 3025 | { |
| 3050 | 3026 | ignore then /* this should not happen */ |
| 3051 | 3027 | show(lstream); |
| 3052 | 3028 | should_not_happen((String file, Word32 line) |-> |
| 3053 | 3029 | print("In '"+file+"' at line "+to_decimal(line)+ |
| 3054 | - ": no action found for accepting state "+to_decimal(s)+".\n"), | |
| 3055 | - token(end_of_input)), | |
| 3056 | - | |
| 3057 | - ignore(f) then f(extract(*buffer_v,start,end),tools,aux); | |
| 3058 | - debug_rnt("ignore with action and restart: "+to_string(extract(*buffer_v,start,end))); | |
| 3059 | - ignore, | |
| 3060 | - | |
| 3030 | + ": no action found for accepting state "+to_decimal(s)+".\n"), ok(end_of_input)), | |
| 3061 | 3031 | return(f) then |
| 3062 | - if f(extract(*buffer_v,start,end),tools,aux) is | |
| 3063 | - { | |
| 3064 | - error(e) then error(e), | |
| 3065 | - ok(tok) then token(tok) | |
| 3066 | - }, | |
| 3032 | + with result = f(extract(*buffer_v,start,end),tools,aux), | |
| 3033 | + result, | |
| 3067 | 3034 | |
| 3068 | 3035 | return(f) then |
| 3069 | - if f((Int k, Int l) |-> extract(*buffer_v,start+k,start+l),end-start,tools,aux) is | |
| 3070 | - { | |
| 3071 | - error(e) then error(e), | |
| 3072 | - ok(tok) then token(tok) | |
| 3073 | - } | |
| 3036 | + with result = f((Int k, Int l) |-> extract(*buffer_v,start+k,start+l),end-start,tools,aux), | |
| 3037 | + result | |
| 3074 | 3038 | } |
| 3075 | 3039 | } |
| 3076 | 3040 | ) |
| ... | ... | @@ -3115,26 +3079,17 @@ public define LexerResult($Token,$Aux) |
| 3115 | 3079 | last_accept_v <- none; |
| 3116 | 3080 | if get(s,actions) is |
| 3117 | 3081 | { |
| 3118 | - failure then should_not_happen(token(end_of_input)), | |
| 3082 | + failure then should_not_happen(ok(end_of_input)), | |
| 3119 | 3083 | success(ac) then if ac is |
| 3120 | 3084 | { |
| 3121 | - ignore then should_not_happen(token(end_of_input)), | |
| 3122 | - ignore(f) then f(extract(*buffer_v,start,end),tools,aux); // execute the 'ignoring' action | |
| 3123 | - debug_rnt("ignore with action and no restart"); | |
| 3124 | - token(end_of_input), | |
| 3085 | + ignore then should_not_happen(ok(end_of_input)), | |
| 3125 | 3086 | return(f) then |
| 3126 | - if f(extract(*buffer_v,start,end),tools,aux) is | |
| 3127 | - { | |
| 3128 | - error(e) then error(e), | |
| 3129 | - ok(tok) then token(tok) | |
| 3130 | - }, | |
| 3087 | + with result = f(extract(*buffer_v,start,end),tools,aux), | |
| 3088 | + result, | |
| 3131 | 3089 | return(f) then |
| 3132 | - if f((Int k, Int l) |-> extract(*buffer_v,start+k,start+l), | |
| 3133 | - end-start,tools,aux) is | |
| 3134 | - { | |
| 3135 | - error(e) then error(e), | |
| 3136 | - ok(tok) then token(tok) | |
| 3137 | - } | |
| 3090 | + with result = f((Int k, Int l) |-> extract(*buffer_v,start+k,start+l), | |
| 3091 | + end-start,tools,aux), | |
| 3092 | + result | |
| 3138 | 3093 | } |
| 3139 | 3094 | }, |
| 3140 | 3095 | |
| ... | ... | @@ -3178,7 +3133,7 @@ public define LexerResult($Token,$Aux) |
| 3178 | 3133 | debug_rnt("buffer cannot be reloaded (case (5a))"); |
| 3179 | 3134 | current_v <- lgbuf; |
| 3180 | 3135 | last_accept_v <- none; |
| 3181 | - token(end_of_input), | |
| 3136 | + ok(end_of_input), | |
| 3182 | 3137 | |
| 3183 | 3138 | success(_) then |
| 3184 | 3139 | /* Warning: after reloading of the buffer 'lgbuf' is invalid. */ |
| ... | ... | @@ -3197,10 +3152,8 @@ public define LexerResult($Token,$Aux) |
| 3197 | 3152 | |
| 3198 | 3153 | |
| 3199 | 3154 | *** [7.4] Plugging a low level lexer onto a lexing stream. |
| 3200 | - | |
| 3201 | - | |
| 3202 | 3155 | |
| 3203 | -define One -> LexerResult($Token,$Aux) | |
| 3156 | +define One -> Result(LexicalError($Aux),$Token) | |
| 3204 | 3157 | plug_lexer |
| 3205 | 3158 | ( |
| 3206 | 3159 | LexingStream stream, |
| ... | ... | @@ -3219,7 +3172,7 @@ define One -> LexerResult($Token,$Aux) |
| 3219 | 3172 | |
| 3220 | 3173 | *** [7.5] Making a single state lexer at run time. |
| 3221 | 3174 | |
| 3222 | -public define Result(RegExprError, (LexingStream,$Aux)-> One -> LexerResult($Token,$Aux)) | |
| 3175 | +public define Result(RegExprError, (LexingStream,$Aux)-> One -> Result(LexicalError($Aux),$Token)) | |
| 3223 | 3176 | make_lexer |
| 3224 | 3177 | ( |
| 3225 | 3178 | List(LexerRankItem($Token,$Aux)) lexer_description, |
| ... | ... | @@ -3322,14 +3275,8 @@ define One |
| 3322 | 3275 | [ ] then if (i&15) = 15 then unique else print(f,"\n"), |
| 3323 | 3276 | [h . t] then if h is transition(label,target_name) then |
| 3324 | 3277 | if label is char(c) then |
| 3325 | - (if ' ' +=< c & c +=< '}' // don't print non utf-8 characters | |
| 3326 | - then | |
| 3327 | - ( | |
| 3328 | - print(f," "+implode([c])+"->"+to_decimal(target_name)); | |
| 3329 | - (if (i&15) = 15 then print(f,"\n") else unique) | |
| 3330 | - ) | |
| 3331 | - else unique | |
| 3332 | - ); | |
| 3278 | + print(f," '"+implode([c])+"'>"+to_decimal(target_name)); | |
| 3279 | + (if (i&15) = 15 then print(f,"\n") else unique); | |
| 3333 | 3280 | dump(f,t,i+1) |
| 3334 | 3281 | }. |
| 3335 | 3282 | |
| ... | ... | @@ -3352,10 +3299,6 @@ define One |
| 3352 | 3299 | accepting(name,transitions,action_rank,action) then |
| 3353 | 3300 | print(f,"\n --- state "+to_decimal(name)+" (accepting with action number "+to_decimal(action_rank)+") ---\n"); |
| 3354 | 3301 | dump(f,transitions,0), |
| 3355 | - | |
| 3356 | - ignoring(name,transitions,action_rank,action) then | |
| 3357 | - print(f,"\n --- state "+to_decimal(name)+" (ignoring with action number "+to_decimal(action_rank)+") ---\n"); | |
| 3358 | - dump(f,transitions,0), | |
| 3359 | 3302 | |
| 3360 | 3303 | accepting(name,transitions,action_rank,action) then |
| 3361 | 3304 | print(f,"\n --- state "+to_decimal(name)+" (accepting with action number "+to_decimal(action_rank)+") ---\n"); |
| ... | ... | @@ -3383,7 +3326,7 @@ define One |
| 3383 | 3326 | if l is precompiled_fast_lexer(fba,sba) then |
| 3384 | 3327 | print(f,"\n *** '"+lexer_name+"'.\n"); |
| 3385 | 3328 | print(f,"\n This (deterministic) automaton has "+to_decimal(length(actions_ranks))+" states.\n"); |
| 3386 | - dump(f,dfa); | |
| 3329 | + //dump(f,dfa); | |
| 3387 | 3330 | print(f,"\ndefine (List(Int),PrecompiledFastLexer)\n"); |
| 3388 | 3331 | print(f," "+lexer_name+" =\n"); |
| 3389 | 3332 | print(f," // The list below gives the action associated to each state:\n"+ |
| ... | ... | @@ -3424,7 +3367,6 @@ define List(Int) |
| 3424 | 3367 | { |
| 3425 | 3368 | rejecting(name,transitions) then [-1 . actions_ranks(t)], |
| 3426 | 3369 | accepting(name,transitions,rank,action) then [rank . actions_ranks(t)], |
| 3427 | - ignoring (name,transitions,rank,action) then [rank . actions_ranks(t)], | |
| 3428 | 3370 | accepting(name,transitions,rank,action) then [rank . actions_ranks(t)], |
| 3429 | 3371 | ignoring(name,transitions) then [-1 . actions_ranks(t)] |
| 3430 | 3372 | } |
| ... | ... | @@ -3448,7 +3390,6 @@ define List(ByteArray) |
| 3448 | 3390 | with asign = (LexerAction($Token,$Aux) a) |-> if a is |
| 3449 | 3391 | { |
| 3450 | 3392 | ignore then "(*i)", // something which is illegal as a regular expression |
| 3451 | - ignore(ac) then "(*ia)", | |
| 3452 | 3393 | return(ac) then "(*r1)", |
| 3453 | 3394 | return(ac) then "(*r2)" |
| 3454 | 3395 | }, |
| ... | ... | @@ -3567,7 +3508,7 @@ public define One |
| 3567 | 3508 | |
| 3568 | 3509 | |
| 3569 | 3510 | |
| 3570 | -public define (LexingStream,$Aux) -> One -> LexerResult($Token,$Aux) | |
| 3511 | +public define (LexingStream,$Aux) -> One -> Result(LexicalError($Aux),$Token) | |
| 3571 | 3512 | retrieve_lexer |
| 3572 | 3513 | ( |
| 3573 | 3514 | List(LexerItem($Token,$Aux)) lexer_description, |
| ... | ... | @@ -3588,7 +3529,6 @@ public define (LexingStream,$Aux) -> One -> LexerResult($Token,$Aux) |
| 3588 | 3529 | |
| 3589 | 3530 | |
| 3590 | 3531 | |
| 3591 | - | |
| 3592 | 3532 | *** [8.2] Dumping the definitions of sublexers. |
| 3593 | 3533 | |
| 3594 | 3534 | For each sublexer we dump a definition of its precompiled form. |
| ... | ... | @@ -3651,12 +3591,7 @@ define One |
| 3651 | 3591 | { |
| 3652 | 3592 | [ ] then unique, |
| 3653 | 3593 | [h . t] then if h is sublexer(name,_) then |
| 3654 | - print(target," "+pad(name,20)+" then if subl_"+name+"(unique) is\n"); | |
| 3655 | - print(target," {\n"); | |
| 3656 | - print(target," ignore then lex(unique),\n"); | |
| 3657 | - print(target," error(e) then error(e),\n"); | |
| 3658 | - print(target," token(tok) then ok(tok)\n"); | |
| 3659 | - print(target," }"); | |
| 3594 | + print(target," "+pad(name,20)+" then subl_"+name+"(unique)"); | |
| 3660 | 3595 | print(target,if t is [] then "\n" else ",\n"); |
| 3661 | 3596 | dump_switching_cases(target,lexer_name,t) |
| 3662 | 3597 | }. |
| ... | ... | @@ -3702,18 +3637,19 @@ define One |
| 3702 | 3637 | ( |
| 3703 | 3638 | WStream target, |
| 3704 | 3639 | String lexer_name, |
| 3640 | + String initial_state_name, | |
| 3705 | 3641 | List(SubLexer($Token,$Aux)) sublexers |
| 3706 | 3642 | ) = |
| 3707 | 3643 | print(target,"\nread lexical_analysis/fast_lexer_5.anubis\n"); |
| 3708 | 3644 | print(target,"\n The function below creates an instance of your lexer from a lexing stream.\n"); |
| 3709 | - print(target,"\n You must provide the dynamic variable containing the state of the lexer."); | |
| 3710 | - print(target,"\n You have to provide the auxiliary datum."); | |
| 3645 | + print(target," It also provides the function for switching the states of the lexer.\n"); | |
| 3646 | + print(target,"\n You have to provide the auxiliary datum.\n"); | |
| 3711 | 3647 | print(target,"\n The argument 'description' is needed for recovering the actions.\n"); |
| 3712 | - print(target,"\npublic define Maybe(One -> Result(LexicalError($Aux),$Token))\n"); | |
| 3648 | + print(target,"\n"+pad("public define Maybe((One -> Result(LexicalError($Aux),$Token),",70)+" // The actual lexer\n"); | |
| 3649 | + print(target,pad(" LexerState_"+lexer_name+" -> One))",70)+" // The change of state command\n"); | |
| 3713 | 3650 | print(target," plug_"+lexer_name+"\n"); |
| 3714 | 3651 | print(target," (\n"); |
| 3715 | 3652 | print(target," LexingStream ls,\n"); |
| 3716 | - print(target," Var(LexerState_"+lexer_name+") lexer_state_v,\n"); | |
| 3717 | 3653 | print(target," $Aux aux,\n"); |
| 3718 | 3654 | print(target," List(SubLexer($Token,$Aux)) description,\n"); |
| 3719 | 3655 | print(target," $Token end_of_input\n"); |
| ... | ... | @@ -3726,14 +3662,15 @@ define One |
| 3726 | 3662 | WStream target, |
| 3727 | 3663 | String signature, |
| 3728 | 3664 | String lexer_name, |
| 3665 | + String initial_state_name, | |
| 3729 | 3666 | List(SubLexer($Token,$Aux)) sublexers, |
| 3730 | 3667 | String end_of_input |
| 3731 | 3668 | ) = |
| 3732 | - print(target,"\npublic define Maybe(One -> Result(LexicalError($Aux),$Token))\n"); | |
| 3669 | + print(target,"\npublic define Maybe((One -> Result(LexicalError($Aux),$Token),\n"); | |
| 3670 | + print(target," LexerState_"+lexer_name+" -> One))\n"); | |
| 3733 | 3671 | print(target," plug_"+lexer_name+"\n"); |
| 3734 | 3672 | print(target," (\n"); |
| 3735 | 3673 | print(target," LexingStream ls,\n"); |
| 3736 | - print(target," Var(LexerState_"+lexer_name+") lexer_state_v,\n"); | |
| 3737 | 3674 | print(target," $Aux aux,\n"); |
| 3738 | 3675 | print(target," List(SubLexer($Token,$Aux)) description,\n"); |
| 3739 | 3676 | print(target," $Token end_of_input\n"); |
| ... | ... | @@ -3742,11 +3679,14 @@ define One |
| 3742 | 3679 | concat(map(sublexer_name,sublexers),"\",\"")+"\"],description)\n"); |
| 3743 | 3680 | print(target," then failure else\n"); |
| 3744 | 3681 | dump_plug_sublexers(target,lexer_name,sublexers,end_of_input); |
| 3745 | - print(target," success(\n"); | |
| 3746 | - print(target," (One u) |-lex-> if *lexer_state_v is\n"); | |
| 3682 | + print(target," with sv = var((LexerState_"+lexer_name+")"+initial_state_name+"),\n"); | |
| 3683 | + print(target," success((\n"); | |
| 3684 | + print(target," (One u) |-> if *sv is\n"); | |
| 3747 | 3685 | print(target," {\n"); |
| 3748 | 3686 | dump_switching_cases(target,lexer_name,sublexers); |
| 3749 | - print(target," }).\n\n"). | |
| 3687 | + print(target," },\n"); | |
| 3688 | + print(target," (LexerState_"+lexer_name+" st) |-> sv <- st\n"); | |
| 3689 | + print(target," )).\n\n"). | |
| 3750 | 3690 | |
| 3751 | 3691 | |
| 3752 | 3692 | public define Bool |
| ... | ... | @@ -3773,6 +3713,7 @@ public define One |
| 3773 | 3713 | String directory, |
| 3774 | 3714 | String lexer_name, |
| 3775 | 3715 | List(String) reads, |
| 3716 | + String initial_state_name, | |
| 3776 | 3717 | List(SubLexer($Token,$Aux)) sublexers, |
| 3777 | 3718 | Word8 escape_char, |
| 3778 | 3719 | String end_of_input |
| ... | ... | @@ -3799,7 +3740,7 @@ public define One |
| 3799 | 3740 | print(target,"\n"); |
| 3800 | 3741 | |
| 3801 | 3742 | // lexer making function declaration. |
| 3802 | - dump_switching_function_dec(target,lexer_name,sublexers); | |
| 3743 | + dump_switching_function_dec(target,lexer_name,initial_state_name,sublexers); | |
| 3803 | 3744 | |
| 3804 | 3745 | print(target,"\n --- Thats all for the public part ! --------------------------------------------------\n\n"); |
| 3805 | 3746 | |
| ... | ... | @@ -3808,7 +3749,7 @@ public define One |
| 3808 | 3749 | |
| 3809 | 3750 | // the switching function. |
| 3810 | 3751 | print(target,"\nread tools/bool.anubis\n\n"); |
| 3811 | - dump_switching_function(target,signature,lexer_name,sublexers,end_of_input); | |
| 3752 | + dump_switching_function(target,signature,lexer_name,initial_state_name,sublexers,end_of_input); | |
| 3812 | 3753 | print("Done.\n"); forget(flush(stdout)) |
| 3813 | 3754 | }, |
| 3814 | 3755 | if read_signature(path) is |
| ... | ... | @@ -3827,11 +3768,12 @@ public define One |
| 3827 | 3768 | ( |
| 3828 | 3769 | String lexer_name, |
| 3829 | 3770 | List(String) reads, |
| 3771 | + String initial_state_name, | |
| 3830 | 3772 | List(SubLexer($Token,$Aux)) sublexers, |
| 3831 | 3773 | Word8 escape_char, |
| 3832 | 3774 | String end_of_input |
| 3833 | 3775 | ) = |
| 3834 | - make_precompiled_lexer("generated",lexer_name,reads,sublexers,escape_char,end_of_input). | |
| 3776 | + make_precompiled_lexer("generated",lexer_name,reads,initial_state_name,sublexers,escape_char,end_of_input). | |
| 3835 | 3777 | |
| 3836 | 3778 | |
| 3837 | 3779 | ... | ... |
anubis_dev/library/predefined.anubis
| ... | ... | @@ -994,16 +994,9 @@ public define ByteArray extract(ByteArray s, Int start, Int end). |
| 994 | 994 | shorter than 'end - start', if 'end' and/or 'start' are out of bounds. In particular, |
| 995 | 995 | it may be the empty byte array. There is no side effect. |
| 996 | 996 | |
| 997 | - The function below writes a byte array 'src' into a byte array 'dest' at a given position. | |
| 998 | - | |
| 999 | -public define One write(ByteArray scr, ByteArray dest, Int position). | |
| 1000 | - | |
| 1001 | - If some bytes of 'src' are to be written outside 'dest' (on one side or the other one), | |
| 1002 | - they are simply not written. Of course, 'write' produces a side effet on 'dest'. | |
| 1003 | - | |
| 1004 | 997 | public define ByteArray ByteArray s + ByteArray t. |
| 1005 | 998 | |
| 1006 | - This concatenates byte arrays. | |
| 999 | + This concatenates byte arrays. | |
| 1007 | 1000 | |
| 1008 | 1001 | |
| 1009 | 1002 | The following two primitives transform a byte array into a string. The first one |
| ... | ... | @@ -1998,7 +1991,7 @@ public type SocketLinger: |
| 1998 | 1991 | |
| 1999 | 1992 | public type SocketOption: |
| 2000 | 1993 | // Socket level |
| 2001 | - so_broadcast(Bool), // Allows transmission of broadcast messages on the socket. | |
| 1994 | + so_broadcast(Bool), // Allows transmission of broadcast messages on the socket. | |
| 2002 | 1995 | so_debug(Bool), // Records debugging information. |
| 2003 | 1996 | so_dontroute(Bool), // Does not route: sends directly to interface. |
| 2004 | 1997 | // Not supported on ATM sockets (results in an error). |
| ... | ... | @@ -2124,7 +2117,7 @@ public type PacketSocketProtocol: |
| 2124 | 2117 | * Non DIX types. Won't clash for 1500 types. |
| 2125 | 2118 | */ |
| 2126 | 2119 | eth_p_802_3, /* Dummy type for 802.3 frames */ |
| 2127 | - eth_p_ax25, /* Dummy protocol id for AX.25 */ | |
| 2120 | + eth_p_ax25, /* Dummy protocol id for AX.25 */ | |
| 2128 | 2121 | eth_h_802_2, /* 802.2 frames */ |
| 2129 | 2122 | eth_h_tr_802_2. /* 802.2 frames */ |
| 2130 | 2123 | ... | ... |
anubis_dev/vm/src/syscall.cpp
| ... | ... | @@ -5902,60 +5902,6 @@ Returns: ExecuteStatus (see predefined.anubis) |
| 5902 | 5902 | MAM(m_IP) += 1+2; |
| 5903 | 5903 | return; |
| 5904 | 5904 | |
| 5905 | - | |
| 5906 | - /* Expects: | |
| 5907 | - - at *(MAM(m_SP)-1): ByteArray src | |
| 5908 | - - at *(MAM(m_SP)-2): ByteArray dest | |
| 5909 | - - at *(MAM(m_SP)-3): Int where | |
| 5910 | - | |
| 5911 | - Returns (One)unique. | |
| 5912 | - */ | |
| 5913 | -syscall_case(write_byte_array) | |
| 5914 | - { | |
| 5915 | - U8 *src = (U8 *)(*(MAM(m_SP)-1)) + 8; | |
| 5916 | - U8 *dest = (U8 *)(*(MAM(m_SP)-2)) + 8; | |
| 5917 | - U32 where = *(MAM(m_SP)-3); | |
| 5918 | - U32 src_length = ((U32 *)(*(MAM(m_SP)-1)))[1]; | |
| 5919 | - U32 dest_length = ((U32 *)(*(MAM(m_SP)-2)))[1]; | |
| 5920 | - U32 i; | |
| 5921 | - | |
| 5922 | - /* get the sign of where (which can be negative) */ | |
| 5923 | - int where_sign = +1; | |
| 5924 | - if (where&2) where_sign = -1; | |
| 5925 | - | |
| 5926 | - /* The result is always (One)unique */ | |
| 5927 | - MAM(m_R) = 0; | |
| 5928 | - | |
| 5929 | - /* compute absolute value of 'where' as an U32 */ | |
| 5930 | - if (where&1) | |
| 5931 | - where = where>>2; | |
| 5932 | - else | |
| 5933 | - where = ((U32 *)where)[2]; | |
| 5934 | - | |
| 5935 | - /* if 'where' is negative: */ | |
| 5936 | - if (where_sign == -1) | |
| 5937 | - { | |
| 5938 | - if (where >= src_length) | |
| 5939 | - { /* all of 'src' is on the left of 'dest' */ | |
| 5940 | - MAM(m_IP) += 1+2; | |
| 5941 | - return; | |
| 5942 | - } | |
| 5943 | - else | |
| 5944 | - { /* write only those bytes of 'src' which fall into 'dest' */ | |
| 5945 | - src += where; | |
| 5946 | - src_length -= where; | |
| 5947 | - where = 0; | |
| 5948 | - } | |
| 5949 | - } | |
| 5950 | - for (i = 0; i < src_length && where+i < dest_length; i++) | |
| 5951 | - { | |
| 5952 | - dest[where+i] = src[i]; | |
| 5953 | - } | |
| 5954 | - /* length of 'dest' is unchanged. */ | |
| 5955 | - } | |
| 5956 | - MAM(m_IP) += 1+2; | |
| 5957 | - return; | |
| 5958 | - | |
| 5959 | 5905 | |
| 5960 | 5906 | syscall_case(virtual_machine_id) |
| 5961 | 5907 | { | ... | ... |