$include_dir="/home/hyper-archives/boost/include"; include("$include_dir/msg-header.inc") ?>
From: Daniel Berlin (dan_at_[hidden])
Date: 2001-06-08 23:01:22
John Max Skaller <skaller_at_[hidden]> writes:
> joel de guzman wrote:
> 
>> 1. Why is compilation times bounded by lexical analysis?
> 
> 	Because finding a lexeme requires scanning characters.
> There are lots of them. Even a very fast lexer has some
> per character overhead. 
> 
>> 2. What are the performance bottle-necks typically encountered?
> 
> 	Old fashioned lexers use buffers. It is very expensive
> to test for end of buffer. On modern equipment, you can just
> load the whole file into memory and add a 'null' at the end,
> this speeds up advancing to the next character a huge amount.
> 
> 	Still, you have to do a lookup into a rather large
> transition table. That typically guarrantees a physical
> memory access (clobbers the cache). So a 100Mhz processor
> is suddenly running at 10Mhz.
> 
> 	For example, the Felix lexer:
> 
> 	ocamllex.opt src/flx_lex.mll
> 	208 states, 3174 transitions, table size 13944 bytes
> 
> Obviously, 13K is bigger than the primary processor cache.
Try a scanner generator that generates directly executable
code, like re2c.
It's often *much* smaller, and *much* faster, than flex.
And it's the same amount of actual effort.
> 
>> 3. What can be done about it?
> 
> 	Often, a hand written lexer will be faster than
> anything else.
Same answer.
Just because flex is slow, doesn't mean everything else is.
You can make scanner generators that generate directly executable
scanners (IE output the real C code, rather than tables and an
interpreter), and you aren't going to make a faster hand coded one,
unless you know something you can't tell the scanner generator that
helps you.
This is rare.
From the following scanner specification:
(It's a severely hacked up re2c example, to avoid being too large in
lines of code to post (it compiles to something a lot smaller, obviously), so
excuse the style. I din't code it, i just shortened it)
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#define COLONCOLON 318
#define COLON 317
#define IDENT 316
#define NUM 315
#define	EOI	319
typedef unsigned int uint;
typedef unsigned char uchar;
#define	BSIZE	8192
#define	YYCTYPE		uchar
#define	YYCURSOR	cursor
#define	YYLIMIT		s->lim
#define	YYMARKER	s->ptr
#define	YYFILL(n)	{cursor = fill(s, cursor);}
#define	RET(i)	{s->cur = cursor; return i;}
typedef struct Scanner {
    int			fd;
    uchar		*bot, *tok, *ptr, *cur, *pos, *lim, *top, *eof;
    uint		line;
} Scanner;
uchar *fill(Scanner *s, uchar *cursor){
    if(!s->eof){
        uint cnt = s->tok - s->bot;
        if(cnt){
            memcpy(s->bot, s->tok, s->lim - s->tok);
            s->tok = s->bot;
            s->ptr -= cnt;
            cursor -= cnt;
            s->pos -= cnt;
            s->lim -= cnt;
        }
        if((s->top - s->lim) < BSIZE){
            uchar *buf = (uchar*) malloc(((s->lim - s->bot) + BSIZE)*sizeof(uchar));
            memcpy(buf, s->tok, s->lim - s->tok);
            s->tok = buf;
            s->ptr = &buf[s->ptr - s->bot];
            cursor = &buf[cursor - s->bot];
            s->pos = &buf[s->pos - s->bot];
            s->lim = &buf[s->lim - s->bot];
            s->top = &s->lim[BSIZE];
            free(s->bot);
            s->bot = buf;
        }
        if((cnt = read(s->fd, (char*) s->lim, BSIZE)) != BSIZE){
            s->eof = &s->lim[cnt]; *(s->eof)++ = '\n';
        }
        s->lim += cnt;
    }
    return cursor;
}
int scan(Scanner *s){
        uchar *cursor = s->cur;
std:
        s->tok = cursor;
/*!re2c
any	= [\000-\377];
O	= [0-7];
D	= [0-9];
L	= [a-zA-Z_];
H	= [a-fA-F0-9];
E	= [Ee] [+-]? D+;
FS	= [fFlL];
IS	= [uUlL]*;
ESC	= [\\] ([abfnrtv?'"\\] | "x" H+ | O+);
*/
/*!re2c
        
        L (L|D)*		{ RET(ID); }
        
        D+			{ RET(NUM); }
        [ \t\v\f]+		{ goto std; }
        "\n"
            {
                if(cursor == s->eof) RET(EOI);
                s->pos = cursor; s->line++;
                goto std;
            }
        any
            {
                printf("unexpected character: %c\n", *s->tok);
                goto std;
            }
*/
}
main(){
    Scanner in;
    int t;
    memset((char*) &in, 0, sizeof(in));
    in.fd = 0;
    while((t = scan(&in)) != EOI){
/*
        printf("%d\t%.*s\n", t, in.cur - in.tok, in.tok);
        printf("%d\n", t);
*/
    }
    close(in.fd);
}
From that, you get, with the -s option, to use binary searches instead
of direct switch statements most of the time, you get:
/* Generated by re2c 0.9.1 on Fri Jun  8 23:53:33 2001 */
#line 1 "test.re"
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#define COLONCOLON 318
#define COLON 317
#define IDENT 316
#define NUM 315
#define	EOI	319
typedef unsigned int uint;
typedef unsigned char uchar;
#define	BSIZE	8192
#define	YYCTYPE		uchar
#define	YYCURSOR	cursor
#define	YYLIMIT		s->lim
#define	YYMARKER	s->ptr
#define	YYFILL(n)	{cursor = fill(s, cursor);}
#define	RET(i)	{s->cur = cursor; return i;}
typedef struct Scanner {
    int			fd;
    uchar		*bot, *tok, *ptr, *cur, *pos, *lim, *top, *eof;
    uint		line;
} Scanner;
uchar *fill(Scanner *s, uchar *cursor){
    if(!s->eof){
        uint cnt = s->tok - s->bot;
        if(cnt){
            memcpy(s->bot, s->tok, s->lim - s->tok);
            s->tok = s->bot;
            s->ptr -= cnt;
            cursor -= cnt;
            s->pos -= cnt;
            s->lim -= cnt;
        }
        if((s->top - s->lim) < BSIZE){
            uchar *buf = (uchar*) malloc(((s->lim - s->bot) + BSIZE)*sizeof(uchar));
            memcpy(buf, s->tok, s->lim - s->tok);
            s->tok = buf;
            s->ptr = &buf[s->ptr - s->bot];
            cursor = &buf[cursor - s->bot];
            s->pos = &buf[s->pos - s->bot];
            s->lim = &buf[s->lim - s->bot];
            s->top = &s->lim[BSIZE];
            free(s->bot);
            s->bot = buf;
        }
        if((cnt = read(s->fd, (char*) s->lim, BSIZE)) != BSIZE){
            s->eof = &s->lim[cnt]; *(s->eof)++ = '\n';
        }
        s->lim += cnt;
    }
    return cursor;
}
int scan(Scanner *s){
        uchar *cursor = s->cur;
std:
        s->tok = cursor;
#line 75
{
        YYCTYPE yych;
        unsigned int yyaccept;
        goto yy0;
yy1:	++YYCURSOR;
yy0:
        if((YYLIMIT - YYCURSOR) < 2) YYFILL(2);
        yych = *YYCURSOR;
        if(yych <= '/'){
                if(yych <= '\n'){
                        if(yych <= '\b')	goto yy10;
                        if(yych <= '\t')	goto yy6;
                        goto yy8;
                } else {
                        if(yych <= '\f')	goto yy6;
                        if(yych == ' ')	goto yy6;
                        goto yy10;
                }
        } else {
                if(yych <= '^'){
                        if(yych <= '9')	goto yy4;
                        if(yych <= '@')	goto yy10;
                        if(yych >= '[')	goto yy10;
                } else {
                        if(yych == '`')	goto yy10;
                        if(yych >= '{')	goto yy10;
                }
        }
yy2:	yych = *++YYCURSOR;
        goto yy17;
yy3:
#line 79
        { RET(ID); }
yy4:	yych = *++YYCURSOR;
        goto yy15;
yy5:
#line 81
        { RET(NUM); }
yy6:	yych = *++YYCURSOR;
        goto yy13;
yy7:
#line 82
        { goto std; }
yy8:	yych = *++YYCURSOR;
yy9:
#line 85
        {
                if(cursor == s->eof) RET(EOI);
                s->pos = cursor; s->line++;
                goto std;
            }
yy10:	yych = *++YYCURSOR;
yy11:
#line 92
        {
                printf("unexpected character: %c\n", *s->tok);
                goto std;
            }
yy12:	++YYCURSOR;
        if(YYLIMIT == YYCURSOR) YYFILL(1);
        yych = *YYCURSOR;
yy13:	if(yych <= '\n'){
                if(yych == '\t')	goto yy12;
                goto yy7;
        } else {
                if(yych <= '\f')	goto yy12;
                if(yych == ' ')	goto yy12;
                goto yy7;
        }
yy14:	++YYCURSOR;
        if(YYLIMIT == YYCURSOR) YYFILL(1);
        yych = *YYCURSOR;
yy15:	if(yych <= '/')	goto yy5;
        if(yych <= '9')	goto yy14;
        goto yy5;
yy16:	++YYCURSOR;
        if(YYLIMIT == YYCURSOR) YYFILL(1);
        yych = *YYCURSOR;
yy17:	if(yych <= 'Z'){
                if(yych <= '/')	goto yy3;
                if(yych <= '9')	goto yy16;
                if(yych <= '@')	goto yy3;
                goto yy16;
        } else {
                if(yych <= '_'){
                        if(yych <= '^')	goto yy3;
                        goto yy16;
                } else {
                        if(yych <= '`')	goto yy3;
                        if(yych <= 'z')	goto yy16;
                        goto yy3;
                }
        }
}
#line 96
}
main(){
    Scanner in;
    int t;
    memset((char*) &in, 0, sizeof(in));
    in.fd = 0;
    while((t = scan(&in)) != EOI){
/*
        printf("%d\t%.*s\n", t, in.cur - in.tok, in.tok);
        printf("%d\n", t);
*/
    }
    close(in.fd);
}
Without the -s option, you get:
/* Generated by re2c 0.5 on Fri Jun  8 23:53:39 2001 */
#line 1 "test.re"
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#define COLONCOLON 318
#define COLON 317
#define IDENT 316
#define NUM 315
#define	EOI	319
typedef unsigned int uint;
typedef unsigned char uchar;
#define	BSIZE	8192
#define	YYCTYPE		uchar
#define	YYCURSOR	cursor
#define	YYLIMIT		s->lim
#define	YYMARKER	s->ptr
#define	YYFILL(n)	{cursor = fill(s, cursor);}
#define	RET(i)	{s->cur = cursor; return i;}
typedef struct Scanner {
    int			fd;
    uchar		*bot, *tok, *ptr, *cur, *pos, *lim, *top, *eof;
    uint		line;
} Scanner;
uchar *fill(Scanner *s, uchar *cursor){
    if(!s->eof){
        uint cnt = s->tok - s->bot;
        if(cnt){
            memcpy(s->bot, s->tok, s->lim - s->tok);
            s->tok = s->bot;
            s->ptr -= cnt;
            cursor -= cnt;
            s->pos -= cnt;
            s->lim -= cnt;
        }
        if((s->top - s->lim) < BSIZE){
            uchar *buf = (uchar*) malloc(((s->lim - s->bot) + BSIZE)*sizeof(uchar));
            memcpy(buf, s->tok, s->lim - s->tok);
            s->tok = buf;
            s->ptr = &buf[s->ptr - s->bot];
            cursor = &buf[cursor - s->bot];
            s->pos = &buf[s->pos - s->bot];
            s->lim = &buf[s->lim - s->bot];
            s->top = &s->lim[BSIZE];
            free(s->bot);
            s->bot = buf;
        }
        if((cnt = read(s->fd, (char*) s->lim, BSIZE)) != BSIZE){
            s->eof = &s->lim[cnt]; *(s->eof)++ = '\n';
        }
        s->lim += cnt;
    }
    return cursor;
}
int scan(Scanner *s){
        uchar *cursor = s->cur;
std:
        s->tok = cursor;
#line 75
{
        YYCTYPE yych;
        unsigned int yyaccept;
        goto yy0;
yy1:	++YYCURSOR;
yy0:
        if((YYLIMIT - YYCURSOR) < 2) YYFILL(2);
        yych = *YYCURSOR;
        switch(yych){
        case '\t':	case '\v':
        case '\f':	case ' ':	goto yy6;
        case '\n':	goto yy8;
        case '0':
        case '1':
        case '2':
        case '3':
        case '4':
        case '5':
        case '6':
        case '7':
        case '8':
        case '9':	goto yy4;
        case 'A':
        case 'B':
        case 'C':
        case 'D':
        case 'E':
        case 'F':
        case 'G':
        case 'H':
        case 'I':
        case 'J':
        case 'K':
        case 'L':
        case 'M':
        case 'N':
        case 'O':
        case 'P':
        case 'Q':
        case 'R':
        case 'S':
        case 'T':
        case 'U':
        case 'V':
        case 'W':
        case 'X':
        case 'Y':
        case 'Z':	case '_':	case 'a':
        case 'b':
        case 'c':
        case 'd':
        case 'e':
        case 'f':
        case 'g':
        case 'h':
        case 'i':
        case 'j':
        case 'k':
        case 'l':
        case 'm':
        case 'n':
        case 'o':
        case 'p':
        case 'q':
        case 'r':
        case 's':
        case 't':
        case 'u':
        case 'v':
        case 'w':
        case 'x':
        case 'y':
        case 'z':	goto yy2;
        default:	goto yy10;
        }
yy2:	yych = *++YYCURSOR;
        goto yy17;
yy3:
#line 79
        { RET(ID); }
yy4:	yych = *++YYCURSOR;
        goto yy15;
yy5:
#line 81
        { RET(NUM); }
yy6:	yych = *++YYCURSOR;
        goto yy13;
yy7:
#line 82
        { goto std; }
yy8:	yych = *++YYCURSOR;
yy9:
#line 85
        {
                if(cursor == s->eof) RET(EOI);
                s->pos = cursor; s->line++;
                goto std;
            }
yy10:	yych = *++YYCURSOR;
yy11:
#line 92
        {
                printf("unexpected character: %c\n", *s->tok);
                goto std;
            }
yy12:	++YYCURSOR;
        if(YYLIMIT == YYCURSOR) YYFILL(1);
        yych = *YYCURSOR;
yy13:	switch(yych){
        case '\t':	case '\v':
        case '\f':	case ' ':	goto yy12;
        default:	goto yy7;
        }
yy14:	++YYCURSOR;
        if(YYLIMIT == YYCURSOR) YYFILL(1);
        yych = *YYCURSOR;
yy15:	switch(yych){
        case '0':
        case '1':
        case '2':
        case '3':
        case '4':
        case '5':
        case '6':
        case '7':
        case '8':
        case '9':	goto yy14;
        default:	goto yy5;
        }
yy16:	++YYCURSOR;
        if(YYLIMIT == YYCURSOR) YYFILL(1);
        yych = *YYCURSOR;
yy17:	switch(yych){
        case '0':
        case '1':
        case '2':
        case '3':
        case '4':
        case '5':
        case '6':
        case '7':
        case '8':
        case '9':	case 'A':
        case 'B':
        case 'C':
        case 'D':
        case 'E':
        case 'F':
        case 'G':
        case 'H':
        case 'I':
        case 'J':
        case 'K':
        case 'L':
        case 'M':
        case 'N':
        case 'O':
        case 'P':
        case 'Q':
        case 'R':
        case 'S':
        case 'T':
        case 'U':
        case 'V':
        case 'W':
        case 'X':
        case 'Y':
        case 'Z':	case '_':	case 'a':
        case 'b':
        case 'c':
        case 'd':
        case 'e':
        case 'f':
        case 'g':
        case 'h':
        case 'i':
        case 'j':
        case 'k':
        case 'l':
        case 'm':
        case 'n':
        case 'o':
        case 'p':
        case 'q':
        case 'r':
        case 's':
        case 't':
        case 'u':
        case 'v':
        case 'w':
        case 'x':
        case 'y':
        case 'z':	goto yy16;
        default:	goto yy3;
        }
}
#line 96
}
main(){
    Scanner in;
    int t;
    memset((char*) &in, 0, sizeof(in));
    in.fd = 0;
    while((t = scan(&in)) != EOI){
/*
        printf("%d\t%.*s\n", t, in.cur - in.tok, in.tok);
        printf("%d\n", t);
*/
    }
    close(in.fd);
}
etc.
There is another option to generate small bitmaps for larger classes,
and use those in addition to the other two above (IE you can combine
-sb, or just use -b).
Which will generate the fastest code of course is dependent on your
compiler.  However, I doubt you could code a scanner significantly
faster than the above, unless your compiler is horrible about gotos or
something.
--Dan
-- "I went to a restaurant that serves "breakfast at any time." So I ordered French Toast during the Renaissance. "-Steven Wright