URI:
       tregcomp.c - plan9port - [fork] Plan 9 from user space
  HTML git clone git://src.adamsgaard.dk/plan9port
   DIR Log
   DIR Files
   DIR Refs
   DIR README
   DIR LICENSE
       ---
       tregcomp.c (10053B)
       ---
            1 #include "lib9.h"
            2 #include "regexp9.h"
            3 #include "regcomp.h"
            4 
            5 #define        TRUE        1
            6 #define        FALSE        0
            7 
            8 /*
            9  * Parser Information
           10  */
           11 typedef
           12 struct Node
           13 {
           14         Reinst*        first;
           15         Reinst*        last;
           16 }Node;
           17 
           18 /* max rune ranges per character class is nelem(classp->spans)/2 */
           19 #define NCCRUNE        nelem(classp->spans)
           20 
           21 #define        NSTACK        20
           22 static        Node        andstack[NSTACK];
           23 static        Node        *andp;
           24 static        int        atorstack[NSTACK];
           25 static        int*        atorp;
           26 static        int        cursubid;                /* id of current subexpression */
           27 static        int        subidstack[NSTACK];        /* parallel to atorstack */
           28 static        int*        subidp;
           29 static        int        lastwasand;        /* Last token was operand */
           30 static        int        nbra;
           31 static        char*        exprp;                /* pointer to next character in source expression */
           32 static        int        lexdone;
           33 static        int        nclass;
           34 static        Reclass*classp;
           35 static        Reinst*        freep;
           36 static        int        errors;
           37 static        Rune        yyrune;                /* last lex'd rune */
           38 static        Reclass*yyclassp;        /* last lex'd class */
           39 
           40 /* predeclared crap */
           41 static        void        operator(int);
           42 static        void        pushand(Reinst*, Reinst*);
           43 static        void        pushator(int);
           44 static        void        evaluntil(int);
           45 static        int        bldcclass(void);
           46 
           47 static jmp_buf regkaboom;
           48 
           49 static        void
           50 rcerror(char *s)
           51 {
           52         errors++;
           53         regerror(s);
           54         longjmp(regkaboom, 1);
           55 }
           56 
           57 static        Reinst*
           58 newinst(int t)
           59 {
           60         freep->type = t;
           61         freep->u2.left = 0;
           62         freep->u1.right = 0;
           63         return freep++;
           64 }
           65 
           66 static        void
           67 operand(int t)
           68 {
           69         Reinst *i;
           70 
           71         if(lastwasand)
           72                 operator(CAT);        /* catenate is implicit */
           73         i = newinst(t);
           74 
           75         if(t == CCLASS || t == NCCLASS)
           76                 i->u1.cp = yyclassp;
           77         if(t == RUNE)
           78                 i->u1.r = yyrune;
           79 
           80         pushand(i, i);
           81         lastwasand = TRUE;
           82 }
           83 
           84 static        void
           85 operator(int t)
           86 {
           87         if(t==RBRA && --nbra<0)
           88                 rcerror("unmatched right paren");
           89         if(t==LBRA){
           90                 if(++cursubid >= NSUBEXP)
           91                         rcerror ("too many subexpressions");
           92                 nbra++;
           93                 if(lastwasand)
           94                         operator(CAT);
           95         } else
           96                 evaluntil(t);
           97         if(t != RBRA)
           98                 pushator(t);
           99         lastwasand = FALSE;
          100         if(t==STAR || t==QUEST || t==PLUS || t==RBRA)
          101                 lastwasand = TRUE;        /* these look like operands */
          102 }
          103 
          104 static        void
          105 regerr2(char *s, int c)
          106 {
          107         char buf[100];
          108         char *cp = buf;
          109         while(*s)
          110                 *cp++ = *s++;
          111         *cp++ = c;
          112         *cp = '\0';
          113         rcerror(buf);
          114 }
          115 
          116 static        void
          117 cant(char *s)
          118 {
          119         char buf[100];
          120         strcpy(buf, "can't happen: ");
          121         strcat(buf, s);
          122         rcerror(buf);
          123 }
          124 
          125 static        void
          126 pushand(Reinst *f, Reinst *l)
          127 {
          128         if(andp >= &andstack[NSTACK])
          129                 cant("operand stack overflow");
          130         andp->first = f;
          131         andp->last = l;
          132         andp++;
          133 }
          134 
          135 static        void
          136 pushator(int t)
          137 {
          138         if(atorp >= &atorstack[NSTACK])
          139                 cant("operator stack overflow");
          140         *atorp++ = t;
          141         *subidp++ = cursubid;
          142 }
          143 
          144 static        Node*
          145 popand(int op)
          146 {
          147         Reinst *inst;
          148 
          149         if(andp <= &andstack[0]){
          150                 regerr2("missing operand for ", op);
          151                 inst = newinst(NOP);
          152                 pushand(inst,inst);
          153         }
          154         return --andp;
          155 }
          156 
          157 static        int
          158 popator(void)
          159 {
          160         if(atorp <= &atorstack[0])
          161                 cant("operator stack underflow");
          162         --subidp;
          163         return *--atorp;
          164 }
          165 
          166 static        void
          167 evaluntil(int pri)
          168 {
          169         Node *op1, *op2;
          170         Reinst *inst1, *inst2;
          171 
          172         while(pri==RBRA || atorp[-1]>=pri){
          173                 switch(popator()){
          174                 default:
          175                         rcerror("unknown operator in evaluntil");
          176                         break;
          177                 case LBRA:                /* must have been RBRA */
          178                         op1 = popand('(');
          179                         inst2 = newinst(RBRA);
          180                         inst2->u1.subid = *subidp;
          181                         op1->last->u2.next = inst2;
          182                         inst1 = newinst(LBRA);
          183                         inst1->u1.subid = *subidp;
          184                         inst1->u2.next = op1->first;
          185                         pushand(inst1, inst2);
          186                         return;
          187                 case OR:
          188                         op2 = popand('|');
          189                         op1 = popand('|');
          190                         inst2 = newinst(NOP);
          191                         op2->last->u2.next = inst2;
          192                         op1->last->u2.next = inst2;
          193                         inst1 = newinst(OR);
          194                         inst1->u1.right = op1->first;
          195                         inst1->u2.left = op2->first;
          196                         pushand(inst1, inst2);
          197                         break;
          198                 case CAT:
          199                         op2 = popand(0);
          200                         op1 = popand(0);
          201                         op1->last->u2.next = op2->first;
          202                         pushand(op1->first, op2->last);
          203                         break;
          204                 case STAR:
          205                         op2 = popand('*');
          206                         inst1 = newinst(OR);
          207                         op2->last->u2.next = inst1;
          208                         inst1->u1.right = op2->first;
          209                         pushand(inst1, inst1);
          210                         break;
          211                 case PLUS:
          212                         op2 = popand('+');
          213                         inst1 = newinst(OR);
          214                         op2->last->u2.next = inst1;
          215                         inst1->u1.right = op2->first;
          216                         pushand(op2->first, inst1);
          217                         break;
          218                 case QUEST:
          219                         op2 = popand('?');
          220                         inst1 = newinst(OR);
          221                         inst2 = newinst(NOP);
          222                         inst1->u2.left = inst2;
          223                         inst1->u1.right = op2->first;
          224                         op2->last->u2.next = inst2;
          225                         pushand(inst1, inst2);
          226                         break;
          227                 }
          228         }
          229 }
          230 
          231 static        Reprog*
          232 optimize(Reprog *pp)
          233 {
          234         Reinst *inst, *target;
          235         int size;
          236         Reprog *npp;
          237         Reclass *cl;
          238         ptrdiff_t diff;
          239 
          240         /*
          241          *  get rid of NOOP chains
          242          */
          243         for(inst=pp->firstinst; inst->type!=END; inst++){
          244                 target = inst->u2.next;
          245                 while(target->type == NOP)
          246                         target = target->u2.next;
          247                 inst->u2.next = target;
          248         }
          249 
          250         /*
          251          *  The original allocation is for an area larger than
          252          *  necessary.  Reallocate to the actual space used
          253          *  and then relocate the code.
          254          */
          255         size = sizeof(Reprog) + (freep - pp->firstinst)*sizeof(Reinst);
          256         npp = realloc(pp, size);
          257         if(npp==0 || npp==pp)
          258                 return pp;
          259         diff = (char *)npp - (char *)pp;
          260         freep = (Reinst *)((char *)freep + diff);
          261         for(inst=npp->firstinst; inst<freep; inst++){
          262                 switch(inst->type){
          263                 case OR:
          264                 case STAR:
          265                 case PLUS:
          266                 case QUEST:
          267                         inst->u1.right = (void*)((char*)inst->u1.right + diff);
          268                         break;
          269                 case CCLASS:
          270                 case NCCLASS:
          271                         inst->u1.right = (void*)((char*)inst->u1.right + diff);
          272                         cl = inst->u1.cp;
          273                         cl->end = (void*)((char*)cl->end + diff);
          274                         break;
          275                 }
          276                 inst->u2.left = (void*)((char*)inst->u2.left + diff);
          277         }
          278         npp->startinst = (void*)((char*)npp->startinst + diff);
          279         return npp;
          280 }
          281 
          282 #ifdef        DEBUG
          283 static        void
          284 dumpstack(void){
          285         Node *stk;
          286         int *ip;
          287 
          288         print("operators\n");
          289         for(ip=atorstack; ip<atorp; ip++)
          290                 print("0%o\n", *ip);
          291         print("operands\n");
          292         for(stk=andstack; stk<andp; stk++)
          293                 print("0%o\t0%o\n", stk->first->type, stk->last->type);
          294 }
          295 
          296 static        void
          297 dump(Reprog *pp)
          298 {
          299         Reinst *l;
          300         Rune *p;
          301 
          302         l = pp->firstinst;
          303         do{
          304                 print("%d:\t0%o\t%d\t%d", l-pp->firstinst, l->type,
          305                         l->u2.left-pp->firstinst, l->u1.right-pp->firstinst);
          306                 if(l->type == RUNE)
          307                         print("\t%C\n", l->u1.r);
          308                 else if(l->type == CCLASS || l->type == NCCLASS){
          309                         print("\t[");
          310                         if(l->type == NCCLASS)
          311                                 print("^");
          312                         for(p = l->u1.cp->spans; p < l->u1.cp->end; p += 2)
          313                                 if(p[0] == p[1])
          314                                         print("%C", p[0]);
          315                                 else
          316                                         print("%C-%C", p[0], p[1]);
          317                         print("]\n");
          318                 } else
          319                         print("\n");
          320         }while(l++->type);
          321 }
          322 #endif
          323 
          324 static        Reclass*
          325 newclass(void)
          326 {
          327         if(nclass >= nelem(((Reprog*)0)->class))
          328                 rcerror("too many character classes; increase Reprog.class size");
          329         return &(classp[nclass++]);
          330 }
          331 
          332 static        int
          333 nextc(Rune *rp)
          334 {
          335         if(lexdone){
          336                 *rp = 0;
          337                 return 1;
          338         }
          339         exprp += chartorune(rp, exprp);
          340         if(*rp == '\\'){
          341                 exprp += chartorune(rp, exprp);
          342                 return 1;
          343         }
          344         if(*rp == 0)
          345                 lexdone = 1;
          346         return 0;
          347 }
          348 
          349 static        int
          350 lex(int literal, int dot_type)
          351 {
          352         int quoted;
          353 
          354         quoted = nextc(&yyrune);
          355         if(literal || quoted){
          356                 if(yyrune == 0)
          357                         return END;
          358                 return RUNE;
          359         }
          360 
          361         switch(yyrune){
          362         case 0:
          363                 return END;
          364         case '*':
          365                 return STAR;
          366         case '?':
          367                 return QUEST;
          368         case '+':
          369                 return PLUS;
          370         case '|':
          371                 return OR;
          372         case '.':
          373                 return dot_type;
          374         case '(':
          375                 return LBRA;
          376         case ')':
          377                 return RBRA;
          378         case '^':
          379                 return BOL;
          380         case '$':
          381                 return EOL;
          382         case '[':
          383                 return bldcclass();
          384         }
          385         return RUNE;
          386 }
          387 
          388 static int
          389 bldcclass(void)
          390 {
          391         int type;
          392         Rune r[NCCRUNE];
          393         Rune *p, *ep, *np;
          394         Rune rune;
          395         int quoted;
          396 
          397         /* we have already seen the '[' */
          398         type = CCLASS;
          399         yyclassp = newclass();
          400 
          401         /* look ahead for negation */
          402         /* SPECIAL CASE!!! negated classes don't match \n */
          403         ep = r;
          404         quoted = nextc(&rune);
          405         if(!quoted && rune == '^'){
          406                 type = NCCLASS;
          407                 quoted = nextc(&rune);
          408                 *ep++ = '\n';
          409                 *ep++ = '\n';
          410         }
          411 
          412         /* parse class into a set of spans */
          413         while(ep < &r[NCCRUNE-1]){
          414                 if(rune == 0){
          415                         rcerror("malformed '[]'");
          416                         return 0;
          417                 }
          418                 if(!quoted && rune == ']')
          419                         break;
          420                 if(!quoted && rune == '-'){
          421                         if(ep == r){
          422                                 rcerror("malformed '[]'");
          423                                 return 0;
          424                         }
          425                         quoted = nextc(&rune);
          426                         if((!quoted && rune == ']') || rune == 0){
          427                                 rcerror("malformed '[]'");
          428                                 return 0;
          429                         }
          430                         *(ep-1) = rune;
          431                 } else {
          432                         *ep++ = rune;
          433                         *ep++ = rune;
          434                 }
          435                 quoted = nextc(&rune);
          436         }
          437         if(ep >= &r[NCCRUNE-1]) {
          438                 rcerror("char class too large; increase Reclass.spans size");
          439                 return 0;
          440         }
          441 
          442         /* sort on span start */
          443         for(p = r; p < ep; p += 2){
          444                 for(np = p; np < ep; np += 2)
          445                         if(*np < *p){
          446                                 rune = np[0];
          447                                 np[0] = p[0];
          448                                 p[0] = rune;
          449                                 rune = np[1];
          450                                 np[1] = p[1];
          451                                 p[1] = rune;
          452                         }
          453         }
          454 
          455         /* merge spans */
          456         np = yyclassp->spans;
          457         p = r;
          458         if(r == ep)
          459                 yyclassp->end = np;
          460         else {
          461                 np[0] = *p++;
          462                 np[1] = *p++;
          463                 for(; p < ep; p += 2)
          464                         /* overlapping or adjacent ranges? */
          465                         if(p[0] <= np[1] + 1){
          466                                 if(p[1] >= np[1])
          467                                         np[1] = p[1];        /* coalesce */
          468                         } else {
          469                                 np += 2;
          470                                 np[0] = p[0];
          471                                 np[1] = p[1];
          472                         }
          473                 yyclassp->end = np+2;
          474         }
          475 
          476         return type;
          477 }
          478 
          479 static        Reprog*
          480 regcomp1(char *s, int literal, int dot_type)
          481 {
          482         int token;
          483         Reprog *volatile pp;
          484 
          485         /* get memory for the program */
          486         pp = malloc(sizeof(Reprog) + 6*sizeof(Reinst)*strlen(s));
          487         if(pp == 0){
          488                 regerror("out of memory");
          489                 return 0;
          490         }
          491         freep = pp->firstinst;
          492         classp = pp->class;
          493         errors = 0;
          494 
          495         if(setjmp(regkaboom))
          496                 goto out;
          497 
          498         /* go compile the sucker */
          499         lexdone = 0;
          500         exprp = s;
          501         nclass = 0;
          502         nbra = 0;
          503         atorp = atorstack;
          504         andp = andstack;
          505         subidp = subidstack;
          506         lastwasand = FALSE;
          507         cursubid = 0;
          508 
          509         /* Start with a low priority operator to prime parser */
          510         pushator(START-1);
          511         while((token = lex(literal, dot_type)) != END){
          512                 if((token&0300) == OPERATOR)
          513                         operator(token);
          514                 else
          515                         operand(token);
          516         }
          517 
          518         /* Close with a low priority operator */
          519         evaluntil(START);
          520 
          521         /* Force END */
          522         operand(END);
          523         evaluntil(START);
          524 #ifdef DEBUG
          525         dumpstack();
          526 #endif
          527         if(nbra)
          528                 rcerror("unmatched left paren");
          529         --andp;        /* points to first and only operand */
          530         pp->startinst = andp->first;
          531 #ifdef DEBUG
          532         dump(pp);
          533 #endif
          534         pp = optimize(pp);
          535 #ifdef DEBUG
          536         print("start: %d\n", andp->first-pp->firstinst);
          537         dump(pp);
          538 #endif
          539 out:
          540         if(errors){
          541                 free(pp);
          542                 pp = 0;
          543         }
          544         return pp;
          545 }
          546 
          547 extern        Reprog*
          548 regcomp(char *s)
          549 {
          550         return regcomp1(s, 0, ANY);
          551 }
          552 
          553 extern        Reprog*
          554 regcomplit(char *s)
          555 {
          556         return regcomp1(s, 1, ANY);
          557 }
          558 
          559 extern        Reprog*
          560 regcompnl(char *s)
          561 {
          562         return regcomp1(s, 0, ANYNL);
          563 }