URI:
       tregcomp.c - plan9port - [fork] Plan 9 from user space
  HTML git clone git://src.adamsgaard.dk/plan9port
   DIR Log
   DIR Files
   DIR Refs
   DIR README
   DIR LICENSE
       ---
       tregcomp.c (9706B)
       ---
            1 /* From libregexp but leaks extra classes when it runs out */
            2 
            3 
            4 #include <u.h>
            5 #include <libc.h>
            6 #include "regexp.h"
            7 #include "/sys/src/libregexp/regcomp.h"
            8 
            9 #define        TRUE        1
           10 #define        FALSE        0
           11 
           12 /*
           13  * Parser Information
           14  */
           15 typedef
           16 struct Node
           17 {
           18         Reinst*        first;
           19         Reinst*        last;
           20 }Node;
           21 
           22 #define        NSTACK        20
           23 static        Node        andstack[NSTACK];
           24 static        Node        *andp;
           25 static        int        atorstack[NSTACK];
           26 static        int*        atorp;
           27 static        int        cursubid;                /* id of current subexpression */
           28 static        int        subidstack[NSTACK];        /* parallel to atorstack */
           29 static        int*        subidp;
           30 static        int        lastwasand;        /* Last token was operand */
           31 static        int        nbra;
           32 static        char*        exprp;                /* pointer to next character in source expression */
           33 static        int        lexdone;
           34 static        int        nclass;
           35 static        Reclass*classp;
           36 static        Reinst*        freep;
           37 static        int        errors;
           38 static        Rune        yyrune;                /* last lex'd rune */
           39 static        Reclass*yyclassp;        /* last lex'd class */
           40 
           41 /* predeclared crap */
           42 static        void        operator(int);
           43 static        void        pushand(Reinst*, Reinst*);
           44 static        void        pushator(int);
           45 static        void        evaluntil(int);
           46 static        int        bldcclass(void);
           47 
           48 static jmp_buf regkaboom;
           49 
           50 static        void
           51 rcerror(char *s)
           52 {
           53         errors++;
           54         regerror(s);
           55         longjmp(regkaboom, 1);
           56 }
           57 
           58 static        Reinst*
           59 newinst(int t)
           60 {
           61         freep->type = t;
           62         freep->left = 0;
           63         freep->right = 0;
           64         return freep++;
           65 }
           66 
           67 static        void
           68 operand(int t)
           69 {
           70         Reinst *i;
           71 
           72         if(lastwasand)
           73                 operator(CAT);        /* catenate is implicit */
           74         i = newinst(t);
           75 
           76         if(t == CCLASS || t == NCCLASS)
           77                 i->cp = yyclassp;
           78         if(t == RUNE)
           79                 i->r = yyrune;
           80 
           81         pushand(i, i);
           82         lastwasand = TRUE;
           83 }
           84 
           85 static        void
           86 operator(int t)
           87 {
           88         if(t==RBRA && --nbra<0)
           89                 rcerror("unmatched right paren");
           90         if(t==LBRA){
           91                 if(++cursubid >= NSUBEXP)
           92                         rcerror ("too many subexpressions");
           93                 nbra++;
           94                 if(lastwasand)
           95                         operator(CAT);
           96         } else
           97                 evaluntil(t);
           98         if(t != RBRA)
           99                 pushator(t);
          100         lastwasand = FALSE;
          101         if(t==STAR || t==QUEST || t==PLUS || t==RBRA)
          102                 lastwasand = TRUE;        /* these look like operands */
          103 }
          104 
          105 static        void
          106 regerr2(char *s, int c)
          107 {
          108         char buf[100];
          109         char *cp = buf;
          110         while(*s)
          111                 *cp++ = *s++;
          112         *cp++ = c;
          113         *cp = '\0';
          114         rcerror(buf);
          115 }
          116 
          117 static        void
          118 cant(char *s)
          119 {
          120         char buf[100];
          121         strcpy(buf, "can't happen: ");
          122         strcat(buf, s);
          123         rcerror(buf);
          124 }
          125 
          126 static        void
          127 pushand(Reinst *f, Reinst *l)
          128 {
          129         if(andp >= &andstack[NSTACK])
          130                 cant("operand stack overflow");
          131         andp->first = f;
          132         andp->last = l;
          133         andp++;
          134 }
          135 
          136 static        void
          137 pushator(int t)
          138 {
          139         if(atorp >= &atorstack[NSTACK])
          140                 cant("operator stack overflow");
          141         *atorp++ = t;
          142         *subidp++ = cursubid;
          143 }
          144 
          145 static        Node*
          146 popand(int op)
          147 {
          148         Reinst *inst;
          149 
          150         if(andp <= &andstack[0]){
          151                 regerr2("missing operand for ", op);
          152                 inst = newinst(NOP);
          153                 pushand(inst,inst);
          154         }
          155         return --andp;
          156 }
          157 
          158 static        int
          159 popator(void)
          160 {
          161         if(atorp <= &atorstack[0])
          162                 cant("operator stack underflow");
          163         --subidp;
          164         return *--atorp;
          165 }
          166 
          167 static        void
          168 evaluntil(int pri)
          169 {
          170         Node *op1, *op2;
          171         Reinst *inst1, *inst2;
          172 
          173         while(pri==RBRA || atorp[-1]>=pri){
          174                 switch(popator()){
          175                 default:
          176                         rcerror("unknown operator in evaluntil");
          177                         break;
          178                 case LBRA:                /* must have been RBRA */
          179                         op1 = popand('(');
          180                         inst2 = newinst(RBRA);
          181                         inst2->subid = *subidp;
          182                         op1->last->next = inst2;
          183                         inst1 = newinst(LBRA);
          184                         inst1->subid = *subidp;
          185                         inst1->next = op1->first;
          186                         pushand(inst1, inst2);
          187                         return;
          188                 case OR:
          189                         op2 = popand('|');
          190                         op1 = popand('|');
          191                         inst2 = newinst(NOP);
          192                         op2->last->next = inst2;
          193                         op1->last->next = inst2;
          194                         inst1 = newinst(OR);
          195                         inst1->right = op1->first;
          196                         inst1->left = op2->first;
          197                         pushand(inst1, inst2);
          198                         break;
          199                 case CAT:
          200                         op2 = popand(0);
          201                         op1 = popand(0);
          202                         op1->last->next = op2->first;
          203                         pushand(op1->first, op2->last);
          204                         break;
          205                 case STAR:
          206                         op2 = popand('*');
          207                         inst1 = newinst(OR);
          208                         op2->last->next = inst1;
          209                         inst1->right = op2->first;
          210                         pushand(inst1, inst1);
          211                         break;
          212                 case PLUS:
          213                         op2 = popand('+');
          214                         inst1 = newinst(OR);
          215                         op2->last->next = inst1;
          216                         inst1->right = op2->first;
          217                         pushand(op2->first, inst1);
          218                         break;
          219                 case QUEST:
          220                         op2 = popand('?');
          221                         inst1 = newinst(OR);
          222                         inst2 = newinst(NOP);
          223                         inst1->left = inst2;
          224                         inst1->right = op2->first;
          225                         op2->last->next = inst2;
          226                         pushand(inst1, inst2);
          227                         break;
          228                 }
          229         }
          230 }
          231 
          232 static        Reprog*
          233 optimize(Reprog *pp)
          234 {
          235         Reinst *inst, *target;
          236         int size;
          237         Reprog *npp;
          238         Reclass *cl;
          239         int diff;
          240 
          241         /*
          242          *  get rid of NOOP chains
          243          */
          244         for(inst=pp->firstinst; inst->type!=END; inst++){
          245                 target = inst->next;
          246                 while(target->type == NOP)
          247                         target = target->next;
          248                 inst->next = target;
          249         }
          250 
          251         /*
          252          *  The original allocation is for an area larger than
          253          *  necessary.  Reallocate to the actual space used
          254          *  and then relocate the code.
          255          */
          256         size = sizeof(Reprog) + (freep - pp->firstinst)*sizeof(Reinst);
          257         npp = realloc(pp, size);
          258         if(npp==0 || npp==pp)
          259                 return pp;
          260         diff = (char *)npp - (char *)pp;
          261         freep = (Reinst *)((char *)freep + diff);
          262         for(inst=npp->firstinst; inst<freep; inst++){
          263                 switch(inst->type){
          264                 case OR:
          265                 case STAR:
          266                 case PLUS:
          267                 case QUEST:
          268                         *(char **)&inst->right += diff;
          269                         break;
          270                 case CCLASS:
          271                 case NCCLASS:
          272                         *(char **)&inst->right += diff;
          273                         cl = inst->cp;
          274                         *(char **)&cl->end += diff;
          275                         break;
          276                 }
          277                 *(char **)&inst->left += diff;
          278         }
          279         *(char **)&npp->startinst += diff;
          280         return npp;
          281 }
          282 
          283 #ifdef        DEBUG
          284 static        void
          285 dumpstack(void){
          286         Node *stk;
          287         int *ip;
          288 
          289         print("operators\n");
          290         for(ip=atorstack; ip<atorp; ip++)
          291                 print("0%o\n", *ip);
          292         print("operands\n");
          293         for(stk=andstack; stk<andp; stk++)
          294                 print("0%o\t0%o\n", stk->first->type, stk->last->type);
          295 }
          296 
          297 static        void
          298 dump(Reprog *pp)
          299 {
          300         Reinst *l;
          301         Rune *p;
          302 
          303         l = pp->firstinst;
          304         do{
          305                 print("%d:\t0%o\t%d\t%d", l-pp->firstinst, l->type,
          306                         l->left-pp->firstinst, l->right-pp->firstinst);
          307                 if(l->type == RUNE)
          308                         print("\t%C\n", l->r);
          309                 else if(l->type == CCLASS || l->type == NCCLASS){
          310                         print("\t[");
          311                         if(l->type == NCCLASS)
          312                                 print("^");
          313                         for(p = l->cp->spans; p < l->cp->end; p += 2)
          314                                 if(p[0] == p[1])
          315                                         print("%C", p[0]);
          316                                 else
          317                                         print("%C-%C", p[0], p[1]);
          318                         print("]\n");
          319                 } else
          320                         print("\n");
          321         }while(l++->type);
          322 }
          323 #endif
          324 
          325 static        Reclass*
          326 newclass(void)
          327 {
          328         if(nclass <= 0){
          329                 classp = mallocz(128*sizeof(Reclass), 1);
          330                 if(classp == nil)
          331                         regerror("out of memory");
          332                 nclass = 128;
          333         }
          334         return &classp[--nclass];
          335 }
          336 
          337 static        int
          338 nextc(Rune *rp)
          339 {
          340         if(lexdone){
          341                 *rp = 0;
          342                 return 1;
          343         }
          344         exprp += chartorune(rp, exprp);
          345         if(*rp == L'\\'){
          346                 exprp += chartorune(rp, exprp);
          347                 return 1;
          348         }
          349         if(*rp == 0)
          350                 lexdone = 1;
          351         return 0;
          352 }
          353 
          354 static        int
          355 lex(int literal, int dot_type)
          356 {
          357         int quoted;
          358 
          359         quoted = nextc(&yyrune);
          360         if(literal || quoted){
          361                 if(yyrune == 0)
          362                         return END;
          363                 return RUNE;
          364         }
          365 
          366         switch(yyrune){
          367         case 0:
          368                 return END;
          369         case L'*':
          370                 return STAR;
          371         case L'?':
          372                 return QUEST;
          373         case L'+':
          374                 return PLUS;
          375         case L'|':
          376                 return OR;
          377         case L'.':
          378                 return dot_type;
          379         case L'(':
          380                 return LBRA;
          381         case L')':
          382                 return RBRA;
          383         case L'^':
          384                 return BOL;
          385         case L'$':
          386                 return EOL;
          387         case L'[':
          388                 return bldcclass();
          389         }
          390         return RUNE;
          391 }
          392 
          393 static int
          394 bldcclass(void)
          395 {
          396         int type;
          397         Rune r[NCCRUNE];
          398         Rune *p, *ep, *np;
          399         Rune rune;
          400         int quoted;
          401 
          402         /* we have already seen the '[' */
          403         type = CCLASS;
          404         yyclassp = newclass();
          405 
          406         /* look ahead for negation */
          407         /* SPECIAL CASE!!! negated classes don't match \n */
          408         ep = r;
          409         quoted = nextc(&rune);
          410         if(!quoted && rune == L'^'){
          411                 type = NCCLASS;
          412                 quoted = nextc(&rune);
          413                 *ep++ = L'\n';
          414                 *ep++ = L'\n';
          415         }
          416 
          417         /* parse class into a set of spans */
          418         for(; ep<&r[NCCRUNE];){
          419                 if(rune == 0){
          420                         rcerror("malformed '[]'");
          421                         return 0;
          422                 }
          423                 if(!quoted && rune == L']')
          424                         break;
          425                 if(!quoted && rune == L'-'){
          426                         if(ep == r){
          427                                 rcerror("malformed '[]'");
          428                                 return 0;
          429                         }
          430                         quoted = nextc(&rune);
          431                         if((!quoted && rune == L']') || rune == 0){
          432                                 rcerror("malformed '[]'");
          433                                 return 0;
          434                         }
          435                         *(ep-1) = rune;
          436                 } else {
          437                         *ep++ = rune;
          438                         *ep++ = rune;
          439                 }
          440                 quoted = nextc(&rune);
          441         }
          442 
          443         /* sort on span start */
          444         for(p = r; p < ep; p += 2){
          445                 for(np = p; np < ep; np += 2)
          446                         if(*np < *p){
          447                                 rune = np[0];
          448                                 np[0] = p[0];
          449                                 p[0] = rune;
          450                                 rune = np[1];
          451                                 np[1] = p[1];
          452                                 p[1] = rune;
          453                         }
          454         }
          455 
          456         /* merge spans */
          457         np = yyclassp->spans;
          458         p = r;
          459         if(r == ep)
          460                 yyclassp->end = np;
          461         else {
          462                 np[0] = *p++;
          463                 np[1] = *p++;
          464                 for(; p < ep; p += 2)
          465                         if(p[0] <= np[1]){
          466                                 if(p[1] > np[1])
          467                                         np[1] = p[1];
          468                         } else {
          469                                 np += 2;
          470                                 np[0] = p[0];
          471                                 np[1] = p[1];
          472                         }
          473                 yyclassp->end = np+2;
          474         }
          475 
          476         return type;
          477 }
          478 
          479 static        Reprog*
          480 regcomp1(char *s, int literal, int dot_type)
          481 {
          482         int token;
          483         Reprog *pp;
          484 
          485         /* get memory for the program */
          486         pp = malloc(sizeof(Reprog) + 6*sizeof(Reinst)*strlen(s));
          487         if(pp == 0){
          488                 regerror("out of memory");
          489                 return 0;
          490         }
          491         freep = pp->firstinst;
          492         classp = pp->class;
          493         errors = 0;
          494 
          495         if(setjmp(regkaboom))
          496                 goto out;
          497 
          498         /* go compile the sucker */
          499         lexdone = 0;
          500         exprp = s;
          501         nclass = NCLASS;
          502         nbra = 0;
          503         atorp = atorstack;
          504         andp = andstack;
          505         subidp = subidstack;
          506         lastwasand = FALSE;
          507         cursubid = 0;
          508 
          509         /* Start with a low priority operator to prime parser */
          510         pushator(START-1);
          511         while((token = lex(literal, dot_type)) != END){
          512                 if((token&0300) == OPERATOR)
          513                         operator(token);
          514                 else
          515                         operand(token);
          516         }
          517 
          518         /* Close with a low priority operator */
          519         evaluntil(START);
          520 
          521         /* Force END */
          522         operand(END);
          523         evaluntil(START);
          524 #ifdef DEBUG
          525         dumpstack();
          526 #endif
          527         if(nbra)
          528                 rcerror("unmatched left paren");
          529         --andp;        /* points to first and only operand */
          530         pp->startinst = andp->first;
          531 #ifdef DEBUG
          532         dump(pp);
          533 #endif
          534         pp = optimize(pp);
          535 #ifdef DEBUG
          536         print("start: %d\n", andp->first-pp->firstinst);
          537         dump(pp);
          538 #endif
          539 out:
          540         if(errors){
          541                 free(pp);
          542                 pp = 0;
          543         }
          544         return pp;
          545 }
          546 
          547 extern        Reprog*
          548 regcomp(char *s)
          549 {
          550         return regcomp1(s, 0, ANY);
          551 }
          552 
          553 extern        Reprog*
          554 regcomplit(char *s)
          555 {
          556         return regcomp1(s, 1, ANY);
          557 }
          558 
          559 extern        Reprog*
          560 regcompnl(char *s)
          561 {
          562         return regcomp1(s, 0, ANYNL);
          563 }