URI:
       tdict.c - plan9port - [fork] Plan 9 from user space
  HTML git clone git://src.adamsgaard.dk/plan9port
   DIR Log
   DIR Files
   DIR Refs
   DIR README
   DIR LICENSE
       ---
       tdict.c (12530B)
       ---
            1 #include <u.h>
            2 #include <libc.h>
            3 #include <bio.h>
            4 #include <regexp.h>
            5 #include <ctype.h>
            6 #include "dict.h"
            7 
            8 /*
            9  * Assumed index file structure: lines of form
           10  *         [^\t]+\t[0-9]+
           11  * First field is key, second is byte offset into dictionary.
           12  * Should be sorted with args -u -t'        ' +0f -1 +0 -1 +1n -2
           13  */
           14 typedef struct Addr Addr;
           15 
           16 struct Addr {
           17         int        n;                /* number of offsets */
           18         int        cur;                /* current position within doff array */
           19         int        maxn;                /* actual current size of doff array */
           20         ulong        doff[1];        /* doff[maxn], with 0..n-1 significant */
           21 };
           22 
           23 Biobuf        binbuf;
           24 Biobuf        boutbuf;
           25 Biobuf        *bin = &binbuf;                /* user cmd input */
           26 Biobuf        *bout = &boutbuf;        /* output */
           27 Biobuf        *bdict;                        /* dictionary */
           28 Biobuf        *bindex;                /* index file */
           29 long        indextop;                /* index offset at end of file */
           30 int        lastcmd;                /* last executed command */
           31 Addr        *dot;                        /* "current" address */
           32 Dict        *dict;                        /* current dictionary */
           33 int        linelen;
           34 int        breaklen = 60;
           35 int        outinhibit;
           36 int        debug;
           37 
           38 void        execcmd(int);
           39 int        getpref(char*, Rune*);
           40 Entry        getentry(int);
           41 int        getfield(Rune*);
           42 long        locate(Rune*);
           43 int        parseaddr(char*, char**);
           44 int        parsecmd(char*);
           45 int        search(char*, int);
           46 long        seeknextline(Biobuf*, long);
           47 void        setdotnext(void);
           48 void        setdotprev(void);
           49 void        sortaddr(Addr*);
           50 void        usage(void);
           51 
           52 enum {
           53         Plen=300,        /* max length of a search pattern */
           54         Fieldlen=200,        /* max length of an index field */
           55         Aslots=10        /* initial number of slots in an address */
           56 };
           57 
           58 void
           59 main(int argc, char **argv)
           60 {
           61         int i, cmd, kflag;
           62         char *line, *p;
           63 
           64         Binit(&binbuf, 0, OREAD);
           65         Binit(&boutbuf, 1, OWRITE);
           66         kflag = 0;
           67         line = 0;
           68         dict = 0;
           69 
           70         for(i=0; dicts[i].name; i++){
           71                 if(access(dictfile(dicts[i].path), 0)>=0 && access(dictfile(dicts[i].indexpath), 0)>=0){
           72                         dict = &dicts[i];
           73                         break;
           74                 }
           75         }
           76         ARGBEGIN {
           77                 case 'd':
           78                         p = ARGF();
           79                         dict = 0;
           80                         if(p) {
           81                                 for(i=0; dicts[i].name; i++)
           82                                         if(strcmp(p, dicts[i].name)==0) {
           83                                                 dict = &dicts[i];
           84                                                 break;
           85                                         }
           86                         }
           87                         if(!dict)
           88                                 usage();
           89                         break;
           90                 case 'c':
           91                         line = ARGF();
           92                         if(!line)
           93                                 usage();
           94                         break;
           95                 case 'k':
           96                         kflag++;
           97                         break;
           98                 case 'D':
           99                         debug++;
          100                         break;
          101                 default:
          102                         usage();
          103         ARGEND }
          104         if(dict == 0){
          105                 err("no dictionaries present on this system");
          106                 exits("nodict");
          107         }
          108 
          109         if(kflag) {
          110                 (*dict->printkey)();
          111                 exits(0);
          112         }
          113         if(argc > 1)
          114                 usage();
          115         else if(argc == 1) {
          116                 if(line)
          117                         usage();
          118                 p = argv[0];
          119                 line = malloc(strlen(p)+5);
          120                 sprint(line, "/%s/P\n", p);
          121         }
          122         dict->path = dictfile(dict->path);
          123         dict->indexpath = dictfile(dict->indexpath);
          124         bdict = Bopen(dict->path, OREAD);
          125         if(!bdict) {
          126                 err("can't open dictionary %s", dict->path);
          127                 exits("nodict");
          128         }
          129         bindex = Bopen(dict->indexpath, OREAD);
          130         if(!bindex) {
          131                 err("can't open index %s", dict->indexpath);
          132                 exits("noindex");
          133         }
          134         indextop = Bseek(bindex, 0L, 2);
          135 
          136         dot = malloc(sizeof(Addr)+(Aslots-1)*sizeof(ulong));
          137         dot->n = 0;
          138         dot->cur = 0;
          139         dot->maxn = Aslots;
          140         lastcmd = 0;
          141 
          142         if(line) {
          143                 cmd = parsecmd(line);
          144                 if(cmd)
          145                         execcmd(cmd);
          146         } else {
          147                 for(;;) {
          148                         Bprint(bout, "*");
          149                         Bflush(bout);
          150                         line = Brdline(bin, '\n');
          151                         linelen = 0;
          152                         if(!line)
          153                                 break;
          154                         cmd = parsecmd(line);
          155                         if(cmd) {
          156                                 execcmd(cmd);
          157                                 lastcmd = cmd;
          158                         }
          159                 }
          160         }
          161         exits(0);
          162 }
          163 
          164 void
          165 usage(void)
          166 {
          167         int i;
          168         char *a, *b;
          169 
          170         Bprint(bout, "Usage: %s [-d dict] [-k] [-c cmd] [word]\n", argv0);
          171         Bprint(bout, "dictionaries (brackets mark dictionaries not present on this system):\n");
          172         for(i = 0; dicts[i].name; i++){
          173                 a = b = "";
          174                 if(access(dictfile(dicts[i].path), 0)<0 || access(dictfile(dicts[i].indexpath), 0)<0){
          175                         a = "[";
          176                         b = "]";
          177                 }
          178                 Bprint(bout, "   %s%s\t%s%s\n", a, dicts[i].name, dicts[i].desc, b);
          179         }
          180         exits("usage");
          181 }
          182 
          183 int
          184 parsecmd(char *line)
          185 {
          186         char *e;
          187         int cmd, ans;
          188 
          189         if(parseaddr(line, &e) >= 0)
          190                 line = e;
          191         else
          192                 return 0;
          193         cmd = *line;
          194         ans = cmd;
          195         if(isupper(cmd))
          196                 cmd = tolower(cmd);
          197         if(!(cmd == 'a' || cmd == 'h' || cmd == 'p' || cmd == 'r' ||
          198              cmd == '\n')) {
          199                 err("unknown command %c", cmd);
          200                 return 0;
          201         }
          202         if(cmd == '\n')
          203                 switch(lastcmd) {
          204                         case 0:        ans = 'H'; break;
          205                         case 'H':        ans = 'p'; break;
          206                         default :        ans = lastcmd; break;
          207                 }
          208         else if(line[1] != '\n' && line[1] != 0)
          209                 err("extra stuff after command %c ignored", cmd);
          210         return ans;
          211 }
          212 
          213 void
          214 execcmd(int cmd)
          215 {
          216         Entry e;
          217         int cur, doall;
          218 
          219         if(isupper(cmd)) {
          220                 doall = 1;
          221                 cmd = tolower(cmd);
          222                 cur = 0;
          223         } else {
          224                 doall = 0;
          225                 cur = dot->cur;
          226         }
          227         if(debug && doall && cmd == 'a')
          228                 Bprint(bout, "%d entries, cur=%d\n", dot->n, cur+1);
          229         for(;;){
          230                 if(cur >= dot->n)
          231                         break;
          232                 if(doall) {
          233                         Bprint(bout, "%d\t", cur+1);
          234                         linelen += 4 + (cur >= 10);
          235                 }
          236                 switch(cmd) {
          237                 case 'a':
          238                         Bprint(bout, "#%lud\n", dot->doff[cur]);
          239                         break;
          240                 case 'h':
          241                 case 'p':
          242                 case 'r':
          243                         e = getentry(cur);
          244                         (*dict->printentry)(e, cmd);
          245                         break;
          246                 }
          247                 cur++;
          248                 if(doall) {
          249                         if(cmd == 'p' || cmd == 'r') {
          250                                 Bputc(bout, '\n');
          251                                 linelen = 0;
          252                         }
          253                 } else
          254                         break;
          255         }
          256         if(cur >= dot->n)
          257                 cur = 0;
          258         dot->cur = cur;
          259 }
          260 
          261 /*
          262  * Address syntax: ('.' | '/' re '/' | '!' re '!' | number | '#' number) ('+' | '-')*
          263  * Answer goes in dot.
          264  * Return -1 if address starts, but get error.
          265  * Return 0 if no address.
          266  */
          267 int
          268 parseaddr(char *line, char **eptr)
          269 {
          270         int delim, plen;
          271         ulong v;
          272         char *e;
          273         char pat[Plen];
          274 
          275         if(*line == '/' || *line == '!') {
          276                 /* anchored regular expression match; '!' means no folding */
          277                 if(*line == '/') {
          278                         delim = '/';
          279                         e = strpbrk(line+1, "/\n");
          280                 } else {
          281                         delim = '!';
          282                         e = strpbrk(line+1, "!\n");
          283                 }
          284                 plen = e-line-1;
          285                 if(plen >= Plen-3) {
          286                         err("pattern too big");
          287                         return -1;
          288                 }
          289                 pat[0] = '^';
          290                 memcpy(pat+1, line+1, plen);
          291                 pat[plen+1] = '$';
          292                 pat[plen+2] = 0;
          293                 if(*e == '\n')
          294                         line = e;
          295                 else
          296                         line = e+1;
          297                 if(!search(pat, delim == '/')) {
          298                         err("pattern not found");
          299                         return -1;
          300                 }
          301         } else if(*line == '#') {
          302                 /* absolute byte offset into dictionary */
          303                 line++;
          304                 if(!isdigit((uchar)*line))
          305                         return -1;
          306                 v = strtoul(line, &e, 10);
          307                 line = e;
          308                 dot->doff[0] = v;
          309                 dot->n = 1;
          310                 dot->cur = 0;
          311         } else if(isdigit((uchar)*line)) {
          312                 v = strtoul(line, &e, 10);
          313                 line = e;
          314                 if(v < 1 || v > dot->n)
          315                         err(".%d not in range [1,%d], ignored",
          316                                 v, dot->n);
          317                 else
          318                         dot->cur = v-1;
          319         } else if(*line == '.') {
          320                 line++;
          321         } else {
          322                 *eptr = line;
          323                 return 0;
          324         }
          325         while(*line == '+' || *line == '-') {
          326                 if(*line == '+')
          327                         setdotnext();
          328                 else
          329                         setdotprev();
          330                 line++;
          331         }
          332         *eptr = line;
          333         return 1;
          334 }
          335 
          336 /*
          337  * Index file is sorted by folded field1.
          338  * Method: find pre, a folded prefix of r.e. pat,
          339  * and then low = offset to beginning of
          340  * line in index file where first match of prefix occurs.
          341  * Then go through index until prefix no longer matches,
          342  * adding each line that matches real pattern to dot.
          343  * Finally, sort dot offsets (uniquing).
          344  * We know pat len < Plen, and that it is surrounded by ^..$
          345  */
          346 int
          347 search(char *pat, int dofold)
          348 {
          349         int needre, prelen, match, n;
          350         Reprog *re;
          351         long ioff, v;
          352         Rune pre[Plen];
          353         Rune lit[Plen];
          354         Rune entry[Fieldlen];
          355         char fpat[Plen];
          356 
          357         prelen = getpref(pat+1, pre);
          358         if(pat[prelen+1] == 0 || pat[prelen+1] == '$') {
          359                 runescpy(lit, pre);
          360                 if(dofold)
          361                         fold(lit);
          362                 needre = 0;
          363                 SET(re);
          364         } else {
          365                 needre = 1;
          366                 if(dofold) {
          367                         foldre(fpat, pat);
          368                         re = regcomp(fpat);
          369                 } else
          370                         re = regcomp(pat);
          371         }
          372         fold(pre);
          373         ioff = locate(pre);
          374         if(ioff < 0)
          375                 return 0;
          376         dot->n = 0;
          377         Bseek(bindex, ioff, 0);
          378         for(;;) {
          379                 if(!getfield(entry))
          380                         break;
          381                 if(dofold)
          382                         fold(entry);
          383                 if(needre)
          384                         match = rregexec(re, entry, 0, 0);
          385                 else
          386                         match = (acomp(lit, entry) == 0);
          387                 if(match) {
          388                         if(!getfield(entry))
          389                                 break;
          390                         v = runetol(entry);
          391                         if(dot->n >= dot->maxn) {
          392                                 n = 2*dot->maxn;
          393                                 dot = realloc(dot,
          394                                         sizeof(Addr)+(n-1)*sizeof(long));
          395                                 if(!dot) {
          396                                         err("out of memory");
          397                                         exits("nomem");
          398                                 }
          399                                 dot->maxn = n;
          400                         }
          401                         dot->doff[dot->n++] = v;
          402                 } else {
          403                         if(!dofold)
          404                                 fold(entry);
          405                         if(*pre) {
          406                                 n = acomp(pre, entry);
          407                                 if(n < -1 || (!needre && n < 0))
          408                                         break;
          409                         }
          410                         /* get to next index entry */
          411                         if(!getfield(entry))
          412                                 break;
          413                 }
          414         }
          415         sortaddr(dot);
          416         dot->cur = 0;
          417         return dot->n;
          418 }
          419 
          420 /*
          421  * Return offset in index file of first line whose folded
          422  * first field has pre as a prefix.  -1 if none found.
          423  */
          424 long
          425 locate(Rune *pre)
          426 {
          427         long top, bot, mid;
          428         Rune entry[Fieldlen];
          429 
          430         if(*pre == 0)
          431                 return 0;
          432         bot = 0;
          433         top = indextop;
          434         if(debug>1)
          435                 fprint(2, "locate looking for prefix %S\n", pre);
          436         for(;;) {
          437                 /*
          438                  * Loop invariant: foldkey(bot) < pre <= foldkey(top)
          439                  * and bot < top, and bot,top point at beginning of lines
          440                  */
          441                 mid = (top+bot) / 2;
          442                 mid = seeknextline(bindex, mid);
          443                 if(debug > 1)
          444                         fprint(2, "bot=%ld, mid=%ld->%ld, top=%ld\n",
          445                                 bot, (top+bot) / 2, mid, top);
          446                 if(mid == top || !getfield(entry))
          447                         break;
          448                 if(debug > 1)
          449                         fprint(2, "key=%S\n", entry);
          450                 /*
          451                  * here mid is strictly between bot and top
          452                  */
          453                 fold(entry);
          454                 if(acomp(pre, entry) <= 0)
          455                         top = mid;
          456                 else
          457                         bot = mid;
          458         }
          459         /*
          460          * bot < top, but they don't necessarily point at successive lines
          461          * Use linear search from bot to find first line that pre is a
          462          * prefix of
          463          */
          464         while((bot = seeknextline(bindex, bot)) <= top) {
          465                 if(!getfield(entry))
          466                         return -1;
          467                 if(debug > 1)
          468                         fprint(2, "key=%S\n", entry);
          469                 fold(entry);
          470                 switch(acomp(pre, entry)) {
          471                 case -2:
          472                         return -1;
          473                 case -1:
          474                 case 0:
          475                         return bot;
          476                 case 1:
          477                 case 2:
          478                         continue;
          479                 }
          480         }
          481         return -1;
          482 
          483 }
          484 
          485 /*
          486  * Get prefix of non re-metacharacters, runified, into pre,
          487  * and return length
          488  */
          489 int
          490 getpref(char *pat, Rune *pre)
          491 {
          492         int n, r;
          493         char *p;
          494 
          495         p = pat;
          496         while(*p) {
          497                 n = chartorune(pre, p);
          498                 r = *pre;
          499                 switch(r) {
          500                 case 0x2e: case 0x2a: case 0x2b: case 0x3f:
          501                 case 0x5b: case 0x5d: case 0x28: case ')':
          502                 case 0x7c: case 0x5e: case 0x24:
          503                         *pre = 0;
          504                         return p-pat;
          505                 case '\\':
          506                         p += n;
          507                         p += chartorune(++pre, p);
          508                         pre++;
          509                         break;
          510                 default:
          511                         p += n;
          512                         pre++;
          513                 }
          514         }
          515         return p-pat;
          516 }
          517 
          518 long
          519 seeknextline(Biobuf *b, long off)
          520 {
          521         long c;
          522 
          523         Bseek(b, off, 0);
          524         do {
          525                 c = Bgetrune(b);
          526         } while(c>=0 && c!='\n');
          527         return Boffset(b);
          528 }
          529 
          530 /*
          531  * Get next field out of index file (either tab- or nl- terminated)
          532  * Answer in *rp, assumed to be Fieldlen long.
          533  * Return 0 if read error first.
          534  */
          535 int
          536 getfield(Rune *rp)
          537 {
          538         long c;
          539         int n;
          540 
          541         for(n=Fieldlen; n-- > 0; ) {
          542                 if ((c = Bgetrune(bindex)) < 0)
          543                         return 0;
          544                 if(c == '\t' || c == '\n') {
          545                         *rp = '\0';
          546                         return 1;
          547                 }
          548                 *rp++ = c;
          549         }
          550         err("word too long");
          551         return 0;
          552 }
          553 
          554 /*
          555  * A compare longs function suitable for qsort
          556  */
          557 static int
          558 longcmp(const void *av, const void *bv)
          559 {
          560         long v;
          561         long *a, *b;
          562 
          563         a = (long*)av;
          564         b = (long*)bv;
          565 
          566         v = *a - *b;
          567         if(v < 0)
          568                 return -1;
          569         else if(v == 0)
          570                 return 0;
          571         else
          572                 return 1;
          573 }
          574 
          575 void
          576 sortaddr(Addr *a)
          577 {
          578         int i, j;
          579         long v;
          580 
          581         if(a->n <= 1)
          582                 return;
          583 
          584         qsort(a->doff, a->n, sizeof(long), longcmp);
          585 
          586         /* remove duplicates */
          587         for(i=0, j=0; j < a->n; j++) {
          588                 v = a->doff[j];
          589                 if(i > 0 && v == a->doff[i-1])
          590                         continue;
          591                 a->doff[i++] = v;
          592         }
          593         a->n = i;
          594 }
          595 
          596 Entry
          597 getentry(int i)
          598 {
          599         long b, e, n;
          600         static Entry ans;
          601         static int anslen = 0;
          602 
          603         b = dot->doff[i];
          604         e = (*dict->nextoff)(b+1);
          605         ans.doff = b;
          606         if(e < 0) {
          607                 err("couldn't seek to entry");
          608                 ans.start = 0;
          609                 ans.end = 0;
          610         } else {
          611                 n = e-b;
          612                 if(n+1 > anslen) {
          613                         ans.start = realloc(ans.start, n+1);
          614                         if(!ans.start) {
          615                                 err("out of memory");
          616                                 exits("nomem");
          617                         }
          618                         anslen = n+1;
          619                 }
          620                 Bseek(bdict, b, 0);
          621                 n = Bread(bdict, ans.start, n);
          622                 ans.end = ans.start + n;
          623                 *ans.end = 0;
          624         }
          625         return ans;
          626 }
          627 
          628 void
          629 setdotnext(void)
          630 {
          631         long b;
          632 
          633         b = (*dict->nextoff)(dot->doff[dot->cur]+1);
          634         if(b < 0) {
          635                 err("couldn't find a next entry");
          636                 return;
          637         }
          638         dot->doff[0] = b;
          639         dot->n = 1;
          640         dot->cur = 0;
          641 }
          642 
          643 void
          644 setdotprev(void)
          645 {
          646         int tryback;
          647         long here, last, p;
          648 
          649         if(dot->cur < 0 || dot->cur >= dot->n)
          650                 return;
          651         tryback = 2000;
          652         here = dot->doff[dot->cur];
          653         last = 0;
          654         while(last == 0) {
          655                 p = here - tryback;
          656                 if(p < 0)
          657                         p = 0;
          658                 for(;;) {
          659                         p = (*dict->nextoff)(p+1);
          660                         if(p < 0)
          661                                 return; /* shouldn't happen */
          662                         if(p >= here)
          663                                 break;
          664                         last = p;
          665                 }
          666                 if(!last) {
          667                         if(here - tryback < 0) {
          668                                 err("can't find a previous entry");
          669                                 return;
          670                         }
          671                         tryback = 2*tryback;
          672                 }
          673         }
          674         dot->doff[0] = last;
          675         dot->n = 1;
          676         dot->cur = 0;
          677 }
          678 
          679 /*
          680  * find the specified file and return a path.
          681  * default location is #9/dict, but can be
          682  * in $dictdir instead.
          683  */
          684 char*
          685 dictfile(char *f)
          686 {
          687         static char *dict;
          688         static int did;
          689 
          690         if(!did){
          691                 dict = getenv("dictpath");
          692                 did = 1;
          693         }
          694 
          695         if(dict)
          696                 return smprint("%s/%s", dict, f);
          697         return unsharp(smprint("#9/dict/%s", f));
          698 }