URI:
       tcache.c - plan9port - [fork] Plan 9 from user space
  HTML git clone git://src.adamsgaard.dk/plan9port
   DIR Log
   DIR Files
   DIR Refs
   DIR README
   DIR LICENSE
       ---
       tcache.c (43774B)
       ---
            1 #include "stdinc.h"
            2 #include "dat.h"
            3 #include "fns.h"
            4 #include "error.h"
            5 
            6 #include "9.h"        /* for cacheFlush */
            7 
             8 typedef struct FreeList FreeList;
             9 typedef struct BAddr BAddr;
            10 
            11 enum {
            12         BadHeap = ~0,        /* b->heap value meaning "block is not in the victim heap" */
            13 };
           14 
           15 /*
           16  * Store data to the memory cache in c->size blocks
           17  * with the block zero extended to fill it out.  When writing to
           18  * Venti, the block will be zero truncated.  The walker will also check
           19  * that the block fits within psize or dsize as the case may be.
           20  */
           21 
            22 struct Cache
            23 {
            24         QLock        lk;        /* guards the shared cache state below */
            25         int         ref;        /* reference count; one extra per daemon thread */
            26         int        mode;        /* open mode; OReadWrite starts the unlink/flush daemons */
            27 
            28         Disk         *disk;        /* underlying disk */
            29         int        size;                        /* block size */
            30         int        ndmap;                /* size of per-block dirty pointer map used in blockWrite */
            31         VtConn *z;        /* Venti connection, used for global (score-addressed) blocks */
            32         u32int        now;                        /* ticks for usage timestamps */
            33         Block        **heads;                /* hash table for finding address */
            34         int        nheap;                        /* number of available victims */
            35         Block        **heap;                        /* heap for locating victims */
            36         long        nblocks;                /* number of blocks allocated */
            37         Block        *blocks;                /* array of block descriptors */
            38         u8int        *mem;                        /* memory for all block data & blists */
            39 
            40         BList        *blfree;        /* free list of BList structs */
            41         Rendez        blrend;        /* sleeps under lk; presumably waits for blfree — sleepers not in this chunk */
            42 
            43         int         ndirty;                        /* number of dirty blocks in the cache */
            44         int         maxdirty;                /* max number of dirty blocks */
            45         u32int        vers;        /* block version counter; first value handed out is 1 */
            46 
            47         long hashSize;        /* number of buckets in heads[] */
            48 
            49         FreeList *fl;        /* free-block list for the data partition */
            50 
            51         Rendez die;                        /* daemon threads should die when die.l != nil */
            52 
            53         Rendez flush;        /* wakes the flush thread when dirty blocks need writing */
            54         Rendez flushwait;        /* NOTE(review): flush-completion wait — users not visible in this chunk */
            55         Rendez heapwait;        /* cacheBumpBlock sleeps here until a block returns to the heap */
            56         BAddr *baddr;        /* per-block address list, presumably the flush queue — confirm in flushThread */
            57         int bw, br, be;        /* NOTE(review): likely write/read/end indices into baddr; not used here */
            58         int nflush;
            59 
            60         Periodic *sync;        /* periodic cacheSync callback, every 30 seconds */
            61 
            62         /* unlink daemon */
            63         BList *uhead;
            64         BList *utail;
            65         Rendez unlink;
            66 
            67         /* block counts */
            68         int nused;
            69         int ndisk;
            70 };
           71 
            72 struct BList {        /* a queued per-block operation (unlink list, free list) */
            73         int part;        /* partition of the block this entry refers to */
            74         u32int addr;        /* block address within the partition */
            75         uchar type;
            76         u32int tag;
            77         u32int epoch;
            78         u32int vers;
            79 
            80         int recurse;        /* for block unlink */
            81 
            82         /* for roll back */
            83         int index;                        /* -1 indicates not valid */
            84         union {
            85                 uchar score[VtScoreSize];
            86                 uchar entry[VtEntrySize];
            87         } old;
            88         BList *next;        /* next entry in the list */
            89 };
           90 
            91 struct BAddr {        /* block identity: partition, address, version */
            92         int part;
            93         u32int addr;
            94         u32int vers;
            95 };
           96 
            97 struct FreeList {        /* allocation state for the data partition */
            98         QLock lk;        /* guards the fields below */
            99         u32int last;                /* last block allocated */
           100         u32int end;                /* end of data partition */
           101         u32int nused;                /* number of used blocks */
           102         u32int epochLow;        /* low epoch when last updated nused */
           103 };
          104 
           105 static FreeList *flAlloc(u32int end);        /* forward declarations for the static helpers below */
           106 static void flFree(FreeList *fl);
           107 
           108 static Block *cacheBumpBlock(Cache *c);
           109 static void heapDel(Block*);
           110 static void heapIns(Block*);
           111 static void cacheCheck(Cache*);
           112 static void unlinkThread(void *a);
           113 static void flushThread(void *a);
           114 static void unlinkBody(Cache *c);
           115 static int cacheFlushBlock(Cache *c);
           116 static void cacheSync(void*);
           117 static BList *blistAlloc(Block*);
           118 static void blistFree(Cache*, BList*);
           119 static void doRemoveLink(Cache*, BList*);
          120 
           121 /*
           122  * Mapping from local block type to Venti type: the index is the
           123  * local type, i.e. BtData or BtDir plus a pointer depth of 0-7.
           124  */
           125 int vtType[BtMax] = {
           125         VtDataType,                /* BtData | 0  */
           126         VtDataType+1,                /* BtData | 1  */
           127         VtDataType+2,                /* BtData | 2  */
           128         VtDataType+3,                /* BtData | 3  */
           129         VtDataType+4,                /* BtData | 4  */
           130         VtDataType+5,                /* BtData | 5  */
           131         VtDataType+6,                /* BtData | 6  */
           132         VtDataType+7,                /* BtData | 7  */
           133         VtDirType,                /* BtDir | 0  */
           134         VtDirType+1,                /* BtDir | 1  */
           135         VtDirType+2,                /* BtDir | 2  */
           136         VtDirType+3,                /* BtDir | 3  */
           137         VtDirType+4,                /* BtDir | 4  */
           138         VtDirType+5,                /* BtDir | 5  */
           139         VtDirType+6,                /* BtDir | 6  */
           140         VtDirType+7,                /* BtDir | 7  */
           141 };
          142 
           143 /*
           144  * Allocate the memory cache: nblocks block descriptors plus their
           145  * data, dirty maps, BList pool and flush queue, all carved out of
           146  * a few big allocations.  Daemon threads start only in OReadWrite.
           147  */
           146 Cache *
           147 cacheAlloc(Disk *disk, VtConn *z, ulong nblocks, int mode)
           148 {
           149         int i;
           150         Cache *c;
           151         Block *b;
           152         BList *bl;
           153         u8int *p;
           154         int nbl;
           155 
           156         c = vtmallocz(sizeof(Cache));
           157 
           158         /* reasonable number of BList elements */
           159         nbl = nblocks * 4;
           160 
           161         c->ref = 1;
           162         c->disk = disk;
           163         c->z = z;
           164         c->size = diskBlockSize(disk);
           165 bwatchSetBlockSize(c->size);        /* block-watch debugging hook */
           166         /* round c->size up to be a nice multiple */
           167         c->size = (c->size + 127) & ~127;
           168         c->ndmap = (c->size/20 + 7) / 8;        /* one bit per 20 bytes of block, rounded up to whole bytes */
           169         c->nblocks = nblocks;
           170         c->hashSize = nblocks;
           171         c->heads = vtmallocz(c->hashSize*sizeof(Block*));
           172         c->heap = vtmallocz(nblocks*sizeof(Block*));
           173         c->blocks = vtmallocz(nblocks*sizeof(Block));
           174         c->mem = vtmallocz(nblocks * (c->size + c->ndmap) + nbl * sizeof(BList));
           175         c->baddr = vtmallocz(nblocks * sizeof(BAddr));
           176         c->mode = mode;
           177         c->vers++;        /* first block version handed out is 1 */
           178         p = c->mem;
           179         for(i = 0; i < nblocks; i++){
           180                 b = &c->blocks[i];
           181                 b->c = c;
           182                 b->data = p;
           183                 b->heap = i;
           184                 b->ioready.l = &b->lk;        /* i/o completion sleeps under the block lock */
           185                 c->heap[i] = b;
           186                 p += c->size;
           187         }
           188         c->nheap = nblocks;        /* initially every block is an eviction candidate */
           189         for(i = 0; i < nbl; i++){
           190                 bl = (BList*)p;
           191                 bl->next = c->blfree;
           192                 c->blfree = bl;
           193                 p += sizeof(BList);
           194         }
           195         /* separate loop to keep blocks and blists reasonably aligned */
           196         for(i = 0; i < nblocks; i++){
           197                 b = &c->blocks[i];
           198                 b->dmap = p;
           199                 p += c->ndmap;
           200         }
           201 
           202         c->blrend.l = &c->lk;
           203 
           204         c->maxdirty = nblocks*(DirtyPercentage*0.01);        /* cap dirty blocks at DirtyPercentage% */
           205 
           206         c->fl = flAlloc(diskSize(disk, PartData));        /* free list covers the data partition */
           207 
           208         c->unlink.l = &c->lk;
           209         c->flush.l = &c->lk;
           210         c->flushwait.l = &c->lk;
           211         c->heapwait.l = &c->lk;
           212         c->sync = periodicAlloc(cacheSync, c, 30*1000);
           213 
           214         if(mode == OReadWrite){
           215                 c->ref += 2;        /* one reference per daemon thread */
           216                 proccreate(unlinkThread, c, STACK);
           217                 proccreate(flushThread, c, STACK);
           218         }
           219         cacheCheck(c);
           220 
           221         return c;
           222 }
          223 
           224 /*
           225  * Free the whole memory cache, flushing all dirty blocks to the disk.
           226  */
           227 void
           228 cacheFree(Cache *c)
           229 {
           230         int i;
           231 
           232         /* kill off daemon threads */
           233         qlock(&c->lk);
           234         c->die.l = &c->lk;        /* non-nil die.l is the daemons' exit signal */
           235         periodicKill(c->sync);
           236         rwakeup(&c->flush);        /* wake both daemons so they notice die.l */
           237         rwakeup(&c->unlink);
           238         while(c->ref > 1)        /* each daemon drops a reference on exit */
           239                 rsleep(&c->die);
           240 
           241         /* flush everything out */
           242         do {
           243                 unlinkBody(c);
           244                 qunlock(&c->lk);
           245                 while(cacheFlushBlock(c))
           246                         ;
           247                 diskFlush(c->disk);
           248                 qlock(&c->lk);
           249         } while(c->uhead || c->ndirty);        /* repeat until no queued unlinks and no dirty blocks */
           250         qunlock(&c->lk);
           251 
           252         cacheCheck(c);
           253 
           254         for(i = 0; i < c->nblocks; i++){
           255                 assert(c->blocks[i].ref == 0);        /* every block must be unreferenced by now */
           256         }
           257         flFree(c->fl);
           258         vtfree(c->baddr);
           259         vtfree(c->heads);
           260         vtfree(c->blocks);
           261         vtfree(c->mem);
           262         diskFree(c->disk);
           263         /* don't close vtSession */
           264         vtfree(c);
           265 }
          266 
           267 static void
           268 cacheDump(Cache *c)        /* debugging: print one line per block descriptor to stderr */
           269 {
           270         int i;
           271         Block *b;
           272 
           273         for(i = 0; i < c->nblocks; i++){
           274                 b = &c->blocks[i];
           275                 fprint(2, "%d. p=%d a=%ud %V t=%d ref=%d state=%s io=%s pc=%#p\n",
           276                         i, b->part, b->addr, b->score, b->l.type, b->ref,
           277                         bsStr(b->l.state), bioStr(b->iostate), b->pc);
           278         }
           279 }
          280 
           281 static void
           282 cacheCheck(Cache *c)        /* consistency check: heap ordering and block/ref accounting */
           283 {
           284         u32int size, now;
           285         int i, k, refed;
           286         Block *b;
           287 
           288         size = c->size;
           289         now = c->now;
           290 
           291         for(i = 0; i < c->nheap; i++){
           292                 if(c->heap[i]->heap != i)
           293                         sysfatal("mis-heaped at %d: %d", i, c->heap[i]->heap);
           294                 /* ages compared relative to now: unsigned subtraction keeps the order right across tick wraparound */
           295                 if(i > 0 && c->heap[(i - 1) >> 1]->used - now > c->heap[i]->used - now)
           296                         sysfatal("bad heap ordering");
           296                 k = (i << 1) + 1;        /* left child; k+1 is the right child */
           297                 if(k < c->nheap && c->heap[i]->used - now > c->heap[k]->used - now)
           298                         sysfatal("bad heap ordering");
           299                 k++;
           300                 if(k < c->nheap && c->heap[i]->used - now > c->heap[k]->used - now)
           301                         sysfatal("bad heap ordering");
           302         }
           303 
           304         refed = 0;
           305         for(i = 0; i < c->nblocks; i++){
           306                 b = &c->blocks[i];
           307                 if(b->data != &c->mem[i * size])
           308                         sysfatal("mis-blocked at %d", i);
           309                 if(b->ref && b->heap == BadHeap){        /* referenced blocks live off the heap */
           310                         refed++;
           311                 }
           312         }
           313 if(c->nheap + refed != c->nblocks){
           314 fprint(2, "%s: cacheCheck: nheap %d refed %d nblocks %ld\n", argv0, c->nheap, refed, c->nblocks);
           315 cacheDump(c);
           316 }
           317         assert(c->nheap + refed == c->nblocks);        /* every block is either a victim candidate or referenced */
           318         refed = 0;
           319         for(i = 0; i < c->nblocks; i++){
           320                 b = &c->blocks[i];
           321                 if(b->ref){
           322 if(1)fprint(2, "%s: p=%d a=%ud %V ref=%d %L\n", argv0, b->part, b->addr, b->score, b->ref, &b->l);
           323                         refed++;
           324                 }
           325         }
           326 if(refed > 0)fprint(2, "%s: cacheCheck: in used %d\n", argv0, refed);
           327 }
          328 
          329 
           330 /*
           331  * locate the block with the oldest second to last use.
           332  * remove it from the heap, and fix up the heap.
           333  */
           334 /* called with c->lk held */
           335 static Block *
           336 cacheBumpBlock(Cache *c)
           337 {
           338         int printed;
           339         Block *b;
           340 
           341         /*
           342          * locate the block with the oldest second to last use.
           343          * remove it from the heap, and fix up the heap.
           344          */
           345         printed = 0;
           346         if(c->nheap == 0){
           347                 while(c->nheap == 0){
           348                         rwakeup(&c->flush);        /* ask the flush thread to clean blocks so one can be reused */
           349                         rsleep(&c->heapwait);
           350                         if(c->nheap == 0){
           351                                 printed = 1;
           352                                 fprint(2, "%s: entire cache is busy, %d dirty "
           353                                         "-- waking flush thread\n",
           354                                         argv0, c->ndirty);
           355                         }
           356                 }
           357                 if(printed)
           358                         fprint(2, "%s: cache is okay again, %d dirty\n",
           359                                 argv0, c->ndirty);
           360         }
           361 
           362         b = c->heap[0];        /* heap root: the least recently used block */
           363         heapDel(b);
           364 
           365         assert(b->heap == BadHeap);
           366         assert(b->ref == 0);
           367         assert(b->iostate != BioDirty && b->iostate != BioReading && b->iostate != BioWriting);
           368         assert(b->prior == nil);
           369         assert(b->uhead == nil);
           370 
           371         /*
           372          * unchain the block from hash chain
           373          */
           374         if(b->prev){
           375                 *(b->prev) = b->next;
           376                 if(b->next)
           377                         b->next->prev = b->prev;
           378                 b->prev = nil;
           379         }
           380 
           381 
           382 if(0)fprint(2, "%s: dropping %d:%x:%V\n", argv0, b->part, b->addr, b->score);
           383         /* set block to a reasonable state */
           384         b->ref = 1;        /* caller gets the sole reference */
           385         b->part = PartError;
           386         memset(&b->l, 0, sizeof(b->l));
           387         b->iostate = BioEmpty;
           388 
           389         return b;
           390 }
          391 
          392 /*
          393  * look for a particular version of the block in the memory cache.
          394  */
          395 static Block *
          396 _cacheLocalLookup(Cache *c, int part, u32int addr, u32int vers,
          397         int waitlock, int *lockfailure)
          398 {
          399         Block *b;
          400         ulong h;
          401 
          402         h = addr % c->hashSize;
          403 
          404         if(lockfailure)
          405                 *lockfailure = 0;
          406 
          407         /*
          408          * look for the block in the cache
          409          */
          410         qlock(&c->lk);
          411         for(b = c->heads[h]; b != nil; b = b->next){
          412                 if(b->part == part && b->addr == addr)
          413                         break;
          414         }
          415         if(b == nil || b->vers != vers){
          416                 qunlock(&c->lk);
          417                 return nil;
          418         }
          419         if(!waitlock && !canqlock(&b->lk)){
          420                 *lockfailure = 1;
          421                 qunlock(&c->lk);
          422                 return nil;
          423         }
          424         heapDel(b);
          425         b->ref++;
          426         qunlock(&c->lk);
          427 
          428         bwatchLock(b);
          429         if(waitlock)
          430                 qlock(&b->lk);
          431         b->nlock = 1;
          432 
          433         for(;;){
          434                 switch(b->iostate){
          435                 default:
          436                         abort();
          437                 case BioEmpty:
          438                 case BioLabel:
          439                 case BioClean:
          440                 case BioDirty:
          441                         if(b->vers != vers){
          442                                 blockPut(b);
          443                                 return nil;
          444                         }
          445                         return b;
          446                 case BioReading:
          447                 case BioWriting:
          448                         rsleep(&b->ioready);
          449                         break;
          450                 case BioVentiError:
          451                         blockPut(b);
          452                         werrstr("venti i/o error block 0x%.8ux", addr);
          453                         return nil;
          454                 case BioReadError:
          455                         blockPut(b);
          456                         werrstr("error reading block 0x%.8ux", addr);
          457                         return nil;
          458                 }
          459         }
          460         /* NOT REACHED */
          461 }
          462 
          463 
           464 /*
           465  * fetch a local (on-disk) block from the memory cache.
           466  * if it's not there, load it, bumping some other block.
           467  */
           468 Block *
           469 _cacheLocal(Cache *c, int part, u32int addr, int mode, u32int epoch)
           470 {
           471         Block *b;
           472         ulong h;
           473 
           474         assert(part != PartVenti);        /* venti blocks go through cacheGlobal */
           475 
           476         h = addr % c->hashSize;
           477 
           478         /*
           479          * look for the block in the cache
           480          */
           481         qlock(&c->lk);
           482         for(b = c->heads[h]; b != nil; b = b->next){
           483                 if(b->part != part || b->addr != addr)
           484                         continue;
           485                 if(epoch && b->l.epoch != epoch){        /* epoch 0 means "any epoch" */
           486 fprint(2, "%s: _cacheLocal want epoch %ud got %ud\n", argv0, epoch, b->l.epoch);
           487                         qunlock(&c->lk);
           488                         werrstr(ELabelMismatch);
           489                         return nil;
           490                 }
           491                 heapDel(b);        /* in use: take it off the victim heap */
           492                 b->ref++;
           493                 break;
           494         }
           495 
           496         if(b == nil){
           497                 b = cacheBumpBlock(c);        /* miss: evict the least recently used block */
           498 
           499                 b->part = part;
           500                 b->addr = addr;
           501                 localToGlobal(addr, b->score);
           502 
           503                 /* chain onto correct hash */
           504                 b->next = c->heads[h];
           505                 c->heads[h] = b;
           506                 if(b->next != nil)
           507                         b->next->prev = &b->next;
           508                 b->prev = &c->heads[h];
           509         }
           510 
           511         qunlock(&c->lk);
           512 
           513         /*
           514          * BUG: what if the epoch changes right here?
           515          * In the worst case, we could end up in some weird
           516          * lock loop, because the block we want no longer exists,
           517          * and instead we're trying to lock a block we have no
           518          * business grabbing.
           519          *
           520          * For now, I'm not going to worry about it.
           521          */
           522 
           523 if(0)fprint(2, "%s: cacheLocal: %d: %d %x\n", argv0, getpid(), b->part, b->addr);
           524         bwatchLock(b);
           525         qlock(&b->lk);
           526         b->nlock = 1;        /* caller holds exactly one lock on b */
           527 
           528         if(part == PartData && b->iostate == BioEmpty){
           529                 /* freshly cached data block: load its label before use */
           530                 if(!readLabel(c, &b->l, addr)){
           531                         blockPut(b);
           532                         return nil;
           533                 }
           534                 blockSetIOState(b, BioLabel);
           535         }
           536         if(epoch && b->l.epoch != epoch){        /* recheck: label may only now be known */
           537                 blockPut(b);
           538 fprint(2, "%s: _cacheLocal want epoch %ud got %ud\n", argv0, epoch, b->l.epoch);
           539                 werrstr(ELabelMismatch);
           540                 return nil;
           541         }
           542 
           543         b->pc = getcallerpc(&c);        /* record caller for debugging dumps */
           544         for(;;){
           545                 switch(b->iostate){
           546                 default:
           547                         abort();
           548                 case BioLabel:
           549                         if(mode == OOverWrite)
           550                                 /*
           551                                  * leave iostate as BioLabel because data
           552                                  * hasn't been read.
           553                                  */
           554                                 return b;
           555                         /* fall through */
           556                 case BioEmpty:
           557                         diskRead(c->disk, b);
           558                         rsleep(&b->ioready);
           559                         break;
           560                 case BioClean:
           561                 case BioDirty:
           562                         return b;
           563                 case BioReading:
           564                 case BioWriting:
           565                         rsleep(&b->ioready);        /* wait out someone else's i/o, then recheck */
           566                         break;
           567                 case BioReadError:
           568                         blockSetIOState(b, BioEmpty);
           569                         blockPut(b);
           570                         werrstr("error reading block 0x%.8ux", addr);
           571                         return nil;
           572                 }
           573         }
           574         /* NOT REACHED */
           575 }
          575 
           576 Block *
           577 cacheLocal(Cache *c, int part, u32int addr, int mode)
           578 {
           579         return _cacheLocal(c, part, addr, mode, 0);        /* epoch 0: accept any epoch */
           580 }
          581 
           582 /*
           583  * fetch a local (on-disk) block from the memory cache.
           584  * if it's not there, load it, bumping some other block.
           585  * check tag and type.
           586  */
           587 Block *
           588 cacheLocalData(Cache *c, u32int addr, int type, u32int tag, int mode, u32int epoch)
           589 {
           590         Block *b;
           591 
           592         b = _cacheLocal(c, PartData, addr, mode, epoch);
           593         if(b == nil)
           594                 return nil;
           595         if(b->l.type != type || b->l.tag != tag){        /* label does not match what the caller expects */
           596                 fprint(2, "%s: cacheLocalData: addr=%d type got %d exp %d: tag got %ux exp %ux\n",
           597                         argv0, addr, b->l.type, type, b->l.tag, tag);
           598                 werrstr(ELabelMismatch);
           599                 blockPut(b);
           600                 return nil;
           601         }
           602         b->pc = getcallerpc(&c);        /* record caller for debugging dumps */
           603         return b;
           604 }
          605 
           606 /*
           607  * fetch a global (Venti) block from the memory cache.
           608  * if it's not there, load it, bumping some other block.
           609  * check tag and type if it's really a local block in disguise.
           610  */
           611 Block *
           612 cacheGlobal(Cache *c, uchar score[VtScoreSize], int type, u32int tag, int mode)
           613 {
           614         int n;
           615         Block *b;
           616         ulong h;
           617         u32int addr;
           618 
           619         addr = globalToLocal(score);        /* the score may encode a local block address */
           620         if(addr != NilBlock){
           621                 b = cacheLocalData(c, addr, type, tag, mode, 0);
           622                 if(b)
           623                         b->pc = getcallerpc(&c);
           624                 return b;
           625         }
           626 
           627         h = (u32int)(score[0]|(score[1]<<8)|(score[2]<<16)|(score[3]<<24)) % c->hashSize;        /* hash on the low 4 bytes of the score */
           628 
           629         /*
           630          * look for the block in the cache
           631          */
           632         qlock(&c->lk);
           633         for(b = c->heads[h]; b != nil; b = b->next){
           634                 if(b->part != PartVenti || memcmp(b->score, score, VtScoreSize) != 0 || b->l.type != type)
           635                         continue;
           636                 heapDel(b);        /* in use: take it off the victim heap */
           637                 b->ref++;
           638                 break;
           639         }
           640 
           641         if(b == nil){
           642 if(0)fprint(2, "%s: cacheGlobal %V %d\n", argv0, score, type);
           643 
           644                 b = cacheBumpBlock(c);        /* miss: evict the least recently used block */
           645 
           646                 b->part = PartVenti;
           647                 b->addr = NilBlock;
           648                 b->l.type = type;
           649                 memmove(b->score, score, VtScoreSize);
           650 
           651                 /* chain onto correct hash */
           652                 b->next = c->heads[h];
           653                 c->heads[h] = b;
           654                 if(b->next != nil)
           655                         b->next->prev = &b->next;
           656                 b->prev = &c->heads[h];
           657         }
           658         qunlock(&c->lk);
           659 
           660         bwatchLock(b);
           661         qlock(&b->lk);
           662         b->nlock = 1;        /* caller holds exactly one lock on b */
           663         b->pc = getcallerpc(&c);
           664 
           665         switch(b->iostate){
           666         default:
           667                 abort();
           668         case BioEmpty:
           669                 n = vtread(c->z, score, vtType[type], b->data, c->size);
           670                 if(n < 0 || vtsha1check(score, b->data, n) < 0){        /* read failed or data does not hash to score */
           671                         blockSetIOState(b, BioVentiError);
           672                         blockPut(b);
           673                         werrstr(
           674                         "venti error reading block %V or wrong score: %r",
           675                                 score);
           676                         return nil;
           677                 }
           678                 vtzeroextend(vtType[type], b->data, n, c->size);        /* venti stores zero-truncated; refill to full size */
           679                 blockSetIOState(b, BioClean);
           680                 return b;
           681         case BioClean:
           682                 return b;
           683         case BioVentiError:
           684                 blockPut(b);
           685                 werrstr("venti i/o error or wrong score, block %V", score);
           686                 return nil;
           687         case BioReadError:
           688                 blockPut(b);
           689                 werrstr("error reading block %V", b->score);
           690                 return nil;
           691         }
           692         /* NOT REACHED */
           693 }
          694 
           695 /*
           696  * allocate a new on-disk block and load it into the memory cache.
           697  * BUG: if the disk is full, should we flush some of it to Venti?
           698  */
           699 static u32int lastAlloc;        /* NOTE(review): not referenced in this chunk; presumably records the most recently allocated address — confirm elsewhere in the file */
          700 
          701 Block *
          702 cacheAllocBlock(Cache *c, int type, u32int tag, u32int epoch, u32int epochLow)
          703 {
          704         FreeList *fl;
          705         u32int addr;
          706         Block *b;
          707         int n, nwrap;
          708         Label lab;
          709 
          710         n = c->size / LabelSize;
          711         fl = c->fl;
          712 
          713         qlock(&fl->lk);
          714         addr = fl->last;
          715         b = cacheLocal(c, PartLabel, addr/n, OReadOnly);
          716         if(b == nil){
          717                 fprint(2, "%s: cacheAllocBlock: xxx %r\n", argv0);
          718                 qunlock(&fl->lk);
          719                 return nil;
          720         }
          721         nwrap = 0;
          722         for(;;){
          723                 if(++addr >= fl->end){
          724                         addr = 0;
          725                         if(++nwrap >= 2){
          726                                 blockPut(b);
          727                                 werrstr("disk is full");
          728                                 /*
          729                                  * try to avoid a continuous spew of console
          730                                  * messages.
          731                                  */
          732                                 if (fl->last != 0)
          733                                         fprint(2, "%s: cacheAllocBlock: xxx1 %r\n",
          734                                                 argv0);
          735                                 fl->last = 0;
          736                                 qunlock(&fl->lk);
          737                                 return nil;
          738                         }
          739                 }
          740                 if(addr%n == 0){
          741                         blockPut(b);
          742                         b = cacheLocal(c, PartLabel, addr/n, OReadOnly);
          743                         if(b == nil){
          744                                 fl->last = addr;
          745                                 fprint(2, "%s: cacheAllocBlock: xxx2 %r\n", argv0);
          746                                 qunlock(&fl->lk);
          747                                 return nil;
          748                         }
          749                 }
          750                 if(!labelUnpack(&lab, b->data, addr%n))
          751                         continue;
          752                 if(lab.state == BsFree)
          753                         goto Found;
          754                 if(lab.state&BsClosed)
          755                 if(lab.epochClose <= epochLow || lab.epoch==lab.epochClose)
          756                         goto Found;
          757         }
          758 Found:
          759         blockPut(b);
          760         b = cacheLocal(c, PartData, addr, OOverWrite);
          761         if(b == nil){
          762                 fprint(2, "%s: cacheAllocBlock: xxx3 %r\n", argv0);
          763                 return nil;
          764         }
          765 assert(b->iostate == BioLabel || b->iostate == BioClean);
          766         fl->last = addr;
          767         lab.type = type;
          768         lab.tag = tag;
          769         lab.state = BsAlloc;
          770         lab.epoch = epoch;
          771         lab.epochClose = ~(u32int)0;
          772         if(!blockSetLabel(b, &lab, 1)){
          773                 fprint(2, "%s: cacheAllocBlock: xxx4 %r\n", argv0);
          774                 blockPut(b);
          775                 return nil;
          776         }
          777         vtzeroextend(vtType[type], b->data, 0, c->size);
          778 if(0)diskWrite(c->disk, b);
          779 
          780 if(0)fprint(2, "%s: fsAlloc %ud type=%d tag = %ux\n", argv0, addr, type, tag);
          781         lastAlloc = addr;
          782         fl->nused++;
          783         qunlock(&fl->lk);
          784         b->pc = getcallerpc(&c);
          785         return b;
          786 }
          787 
          788 int
          789 cacheDirty(Cache *c)
          790 {
          791         return c->ndirty;
          792 }
          793 
          794 void
          795 cacheCountUsed(Cache *c, u32int epochLow, u32int *used, u32int *total, u32int *bsize)
          796 {
          797         int n;
          798         u32int addr, nused;
          799         Block *b;
          800         Label lab;
          801         FreeList *fl;
          802 
          803         fl = c->fl;
          804         n = c->size / LabelSize;
          805         *bsize = c->size;
          806         qlock(&fl->lk);
          807         if(fl->epochLow == epochLow){
          808                 *used = fl->nused;
          809                 *total = fl->end;
          810                 qunlock(&fl->lk);
          811                 return;
          812         }
          813         b = nil;
          814         nused = 0;
          815         for(addr=0; addr<fl->end; addr++){
          816                 if(addr%n == 0){
          817                         blockPut(b);
          818                         b = cacheLocal(c, PartLabel, addr/n, OReadOnly);
          819                         if(b == nil){
          820                                 fprint(2, "%s: flCountUsed: loading %ux: %r\n",
          821                                         argv0, addr/n);
          822                                 break;
          823                         }
          824                 }
          825                 if(!labelUnpack(&lab, b->data, addr%n))
          826                         continue;
          827                 if(lab.state == BsFree)
          828                         continue;
          829                 if(lab.state&BsClosed)
          830                 if(lab.epochClose <= epochLow || lab.epoch==lab.epochClose)
          831                         continue;
          832                 nused++;
          833         }
          834         blockPut(b);
          835         if(addr == fl->end){
          836                 fl->nused = nused;
          837                 fl->epochLow = epochLow;
          838         }
          839         *used = nused;
          840         *total = fl->end;
          841         qunlock(&fl->lk);
          842         return;
          843 }
          844 
          845 static FreeList *
          846 flAlloc(u32int end)
          847 {
          848         FreeList *fl;
          849 
          850         fl = vtmallocz(sizeof(*fl));
          851         fl->last = 0;
          852         fl->end = end;
          853         return fl;
          854 }
          855 
/* release the in-memory free-list structure (no on-disk state is touched) */
static void
flFree(FreeList *fl)
{
        vtfree(fl);
}
          861 
          862 u32int
          863 cacheLocalSize(Cache *c, int part)
          864 {
          865         return diskSize(c->disk, part);
          866 }
          867 
          868 /*
          869  * The thread that has locked b may refer to it by
          870  * multiple names.  Nlock counts the number of
          871  * references the locking thread holds.  It will call
          872  * blockPut once per reference.
          873  */
          874 void
          875 blockDupLock(Block *b)
          876 {
          877         assert(b->nlock > 0);
          878         b->nlock++;
          879 }
          880 
          881 /*
          882  * we're done with the block.
          883  * unlock it.  can't use it after calling this.
          884  */
          885 void
          886 blockPut(Block* b)
          887 {
          888         Cache *c;
          889 
          890         if(b == nil)
          891                 return;
          892 
          893 if(0)fprint(2, "%s: blockPut: %d: %d %x %d %s\n", argv0, getpid(), b->part, b->addr, c->nheap, bioStr(b->iostate));
          894 
          895         if(b->iostate == BioDirty)
          896                 bwatchDependency(b);
          897 
          898         if(--b->nlock > 0)
          899                 return;
          900 
          901         /*
          902          * b->nlock should probably stay at zero while
          903          * the block is unlocked, but diskThread and rsleep
          904          * conspire to assume that they can just qlock(&b->lk); blockPut(b),
          905          * so we have to keep b->nlock set to 1 even
          906          * when the block is unlocked.
          907          */
          908         assert(b->nlock == 0);
          909         b->nlock = 1;
          910 //        b->pc = 0;
          911 
          912         bwatchUnlock(b);
          913         qunlock(&b->lk);
          914         c = b->c;
          915         qlock(&c->lk);
          916 
          917         if(--b->ref > 0){
          918                 qunlock(&c->lk);
          919                 return;
          920         }
          921 
          922         assert(b->ref == 0);
          923         switch(b->iostate){
          924         default:
          925                 b->used = c->now++;
          926                 heapIns(b);
          927                 break;
          928         case BioEmpty:
          929         case BioLabel:
          930                 if(c->nheap == 0)
          931                         b->used = c->now++;
          932                 else
          933                         b->used = c->heap[0]->used;
          934                 heapIns(b);
          935                 break;
          936         case BioDirty:
          937                 break;
          938         }
          939         qunlock(&c->lk);
          940 }
          941 
          942 /*
          943  * set the label associated with a block.
          944  */
          945 Block*
          946 _blockSetLabel(Block *b, Label *l)
          947 {
          948         int lpb;
          949         Block *bb;
          950         u32int a;
          951         Cache *c;
          952 
          953         c = b->c;
          954 
          955         assert(b->part == PartData);
          956         assert(b->iostate == BioLabel || b->iostate == BioClean || b->iostate == BioDirty);
          957         lpb = c->size / LabelSize;
          958         a = b->addr / lpb;
          959         bb = cacheLocal(c, PartLabel, a, OReadWrite);
          960         if(bb == nil){
          961                 blockPut(b);
          962                 return nil;
          963         }
          964         b->l = *l;
          965         labelPack(l, bb->data, b->addr%lpb);
          966         blockDirty(bb);
          967         return bb;
          968 }
          969 
          970 int
          971 blockSetLabel(Block *b, Label *l, int allocating)
          972 {
          973         Block *lb;
          974 
          975         lb = _blockSetLabel(b, l);
          976         if(lb == nil)
          977                 return 0;
          978 
          979         /*
          980          * If we're allocating the block, make sure the label (bl)
          981          * goes to disk before the data block (b) itself.  This is to help
          982          * the blocks that in turn depend on b.
          983          *
          984          * Suppose bx depends on (must be written out after) b.
          985          * Once we write b we'll think it's safe to write bx.
          986          * Bx can't get at b unless it has a valid label, though.
          987          *
          988          * Allocation is the only case in which having a current label
          989          * is vital because:
          990          *
          991          *        - l.type is set at allocation and never changes.
          992          *        - l.tag is set at allocation and never changes.
          993          *        - l.state is not checked when we load blocks.
          994          *        - the archiver cares deeply about l.state being
          995          *                BaActive vs. BaCopied, but that's handled
          996          *                by direct calls to _blockSetLabel.
          997          */
          998 
          999         if(allocating)
         1000                 blockDependency(b, lb, -1, nil, nil);
         1001         blockPut(lb);
         1002         return 1;
         1003 }
         1004 
         1005 /*
         1006  * Record that bb must be written out before b.
         1007  * If index is given, we're about to overwrite the score/e
         1008  * at that index in the block.  Save the old value so we
         1009  * can write a safer ``old'' version of the block if pressed.
         1010  */
         1011 void
         1012 blockDependency(Block *b, Block *bb, int index, uchar *score, Entry *e)
         1013 {
         1014         BList *p;
         1015 
         1016         if(bb->iostate == BioClean)
         1017                 return;
         1018 
         1019         /*
         1020          * Dependencies for blocks containing Entry structures
         1021          * or scores must always be explained.  The problem with
         1022          * only explaining some of them is this.  Suppose we have two
         1023          * dependencies for the same field, the first explained
         1024          * and the second not.  We try to write the block when the first
         1025          * dependency is not written but the second is.  We will roll back
         1026          * the first change even though the second trumps it.
         1027          */
         1028         if(index == -1 && bb->part == PartData)
         1029                 assert(b->l.type == BtData);
         1030 
         1031         if(bb->iostate != BioDirty){
         1032                 fprint(2, "%s: %d:%x:%d iostate is %d in blockDependency\n",
         1033                         argv0, bb->part, bb->addr, bb->l.type, bb->iostate);
         1034                 abort();
         1035         }
         1036 
         1037         p = blistAlloc(bb);
         1038         if(p == nil)
         1039                 return;
         1040 
         1041         assert(bb->iostate == BioDirty);
         1042 if(0)fprint(2, "%s: %d:%x:%d depends on %d:%x:%d\n", argv0, b->part, b->addr, b->l.type, bb->part, bb->addr, bb->l.type);
         1043 
         1044         p->part = bb->part;
         1045         p->addr = bb->addr;
         1046         p->type = bb->l.type;
         1047         p->vers = bb->vers;
         1048         p->index = index;
         1049         if(p->index >= 0){
         1050                 /*
         1051                  * This test would just be b->l.type==BtDir except
         1052                  * we need to exclude the super block.
         1053                  */
         1054                 if(b->l.type == BtDir && b->part == PartData)
         1055                         entryPack(e, p->old.entry, 0);
         1056                 else
         1057                         memmove(p->old.score, score, VtScoreSize);
         1058         }
         1059         p->next = b->prior;
         1060         b->prior = p;
         1061 }
         1062 
         1063 /*
         1064  * Mark an in-memory block as dirty.  If there are too many
         1065  * dirty blocks, start writing some out to disk.
         1066  *
         1067  * If there were way too many dirty blocks, we used to
         1068  * try to do some flushing ourselves, but it's just too dangerous --
         1069  * it implies that the callers cannot have any of our priors locked,
         1070  * but this is hard to avoid in some cases.
         1071  */
         1072 int
         1073 blockDirty(Block *b)
         1074 {
         1075         Cache *c;
         1076 
         1077         c = b->c;
         1078 
         1079         assert(b->part != PartVenti);
         1080 
         1081         if(b->iostate == BioDirty)
         1082                 return 1;
         1083         assert(b->iostate == BioClean || b->iostate == BioLabel);
         1084 
         1085         qlock(&c->lk);
         1086         b->iostate = BioDirty;
         1087         c->ndirty++;
         1088         if(c->ndirty > (c->maxdirty>>1))
         1089                 rwakeup(&c->flush);
         1090         qunlock(&c->lk);
         1091 
         1092         return 1;
         1093 }
         1094 
         1095 /*
         1096  * We've decided to write out b.  Maybe b has some pointers to blocks
         1097  * that haven't yet been written to disk.  If so, construct a slightly out-of-date
         1098  * copy of b that is safe to write out.  (diskThread will make sure the block
         1099  * remains marked as dirty.)
         1100  */
         1101 uchar *
         1102 blockRollback(Block *b, uchar *buf)
         1103 {
         1104         u32int addr;
         1105         BList *p;
         1106         Super super;
         1107 
         1108         /* easy case */
         1109         if(b->prior == nil)
         1110                 return b->data;
         1111 
         1112         memmove(buf, b->data, b->c->size);
         1113         for(p=b->prior; p; p=p->next){
         1114                 /*
         1115                  * we know p->index >= 0 because blockWrite has vetted this block for us.
         1116                  */
         1117                 assert(p->index >= 0);
         1118                 assert(b->part == PartSuper || (b->part == PartData && b->l.type != BtData));
         1119                 if(b->part == PartSuper){
         1120                         assert(p->index == 0);
         1121                         superUnpack(&super, buf);
         1122                         addr = globalToLocal(p->old.score);
         1123                         if(addr == NilBlock){
         1124                                 fprint(2, "%s: rolling back super block: "
         1125                                         "bad replacement addr %V\n",
         1126                                         argv0, p->old.score);
         1127                                 abort();
         1128                         }
         1129                         super.active = addr;
         1130                         superPack(&super, buf);
         1131                         continue;
         1132                 }
         1133                 if(b->l.type == BtDir)
         1134                         memmove(buf+p->index*VtEntrySize, p->old.entry, VtEntrySize);
         1135                 else
         1136                         memmove(buf+p->index*VtScoreSize, p->old.score, VtScoreSize);
         1137         }
         1138         return buf;
         1139 }
         1140 
         1141 /*
         1142  * Try to write block b.
         1143  * If b depends on other blocks:
         1144  *
         1145  *        If the block has been written out, remove the dependency.
         1146  *        If the dependency is replaced by a more recent dependency,
         1147  *                throw it out.
         1148  *        If we know how to write out an old version of b that doesn't
         1149  *                depend on it, do that.
         1150  *
         1151  *        Otherwise, bail.
         1152  */
         1153 int
         1154 blockWrite(Block *b, int waitlock)
         1155 {
         1156         uchar *dmap;
         1157         Cache *c;
         1158         BList *p, **pp;
         1159         Block *bb;
         1160         int lockfail;
         1161 
         1162         c = b->c;
         1163 
         1164         if(b->iostate != BioDirty)
         1165                 return 1;
         1166 
         1167         dmap = b->dmap;
         1168         memset(dmap, 0, c->ndmap);
         1169         pp = &b->prior;
         1170         for(p=*pp; p; p=*pp){
         1171                 if(p->index >= 0){
         1172                         /* more recent dependency has succeeded; this one can go */
         1173                         if(dmap[p->index/8] & (1<<(p->index%8)))
         1174                                 goto ignblock;
         1175                 }
         1176 
         1177                 lockfail = 0;
         1178                 bb = _cacheLocalLookup(c, p->part, p->addr, p->vers, waitlock,
         1179                         &lockfail);
         1180                 if(bb == nil){
         1181                         if(lockfail)
         1182                                 return 0;
         1183                         /* block not in cache => was written already */
         1184                         dmap[p->index/8] |= 1<<(p->index%8);
         1185                         goto ignblock;
         1186                 }
         1187 
         1188                 /*
         1189                  * same version of block is still in cache.
         1190                  *
         1191                  * the assertion is true because the block still has version p->vers,
         1192                  * which means it hasn't been written out since we last saw it.
         1193                  */
         1194                 if(bb->iostate != BioDirty){
         1195                         fprint(2, "%s: %d:%x:%d iostate is %d in blockWrite\n",
         1196                                 argv0, bb->part, bb->addr, bb->l.type, bb->iostate);
         1197                         /* probably BioWriting if it happens? */
         1198                         if(bb->iostate == BioClean)
         1199                                 goto ignblock;
         1200                 }
         1201 
         1202                 blockPut(bb);
         1203 
         1204                 if(p->index < 0){
         1205                         /*
         1206                          * We don't know how to temporarily undo
         1207                          * b's dependency on bb, so just don't write b yet.
         1208                          */
         1209                         if(0) fprint(2, "%s: blockWrite skipping %d %x %d %d; need to write %d %x %d\n",
         1210                                 argv0, b->part, b->addr, b->vers, b->l.type, p->part, p->addr, bb->vers);
         1211                         return 0;
         1212                 }
         1213                 /* keep walking down the list */
         1214                 pp = &p->next;
         1215                 continue;
         1216 
         1217 ignblock:
         1218                 *pp = p->next;
         1219                 blistFree(c, p);
         1220                 continue;
         1221         }
         1222 
         1223         /*
         1224          * DiskWrite must never be called with a double-locked block.
         1225          * This call to diskWrite is okay because blockWrite is only called
         1226          * from the cache flush thread, which never double-locks a block.
         1227          */
         1228         diskWrite(c->disk, b);
         1229         return 1;
         1230 }
         1231 
         1232 /*
         1233  * Change the I/O state of block b.
         1234  * Just an assignment except for magic in
         1235  * switch statement (read comments there).
         1236  */
         1237 void
         1238 blockSetIOState(Block *b, int iostate)
         1239 {
         1240         int dowakeup;
         1241         Cache *c;
         1242         BList *p, *q;
         1243 
         1244 if(0) fprint(2, "%s: iostate part=%d addr=%x %s->%s\n", argv0, b->part, b->addr, bioStr(b->iostate), bioStr(iostate));
         1245 
         1246         c = b->c;
         1247 
         1248         dowakeup = 0;
         1249         switch(iostate){
         1250         default:
         1251                 abort();
         1252         case BioEmpty:
         1253                 assert(!b->uhead);
         1254                 break;
         1255         case BioLabel:
         1256                 assert(!b->uhead);
         1257                 break;
         1258         case BioClean:
         1259                 bwatchDependency(b);
         1260                 /*
         1261                  * If b->prior is set, it means a write just finished.
         1262                  * The prior list isn't needed anymore.
         1263                  */
         1264                 for(p=b->prior; p; p=q){
         1265                         q = p->next;
         1266                         blistFree(c, p);
         1267                 }
         1268                 b->prior = nil;
         1269                 /*
         1270                  * Freeing a block or just finished a write.
         1271                  * Move the blocks from the per-block unlink
         1272                  * queue to the cache unlink queue.
         1273                  */
         1274                 if(b->iostate == BioDirty || b->iostate == BioWriting){
         1275                         qlock(&c->lk);
         1276                         c->ndirty--;
         1277                         b->iostate = iostate;        /* change here to keep in sync with ndirty */
         1278                         b->vers = c->vers++;
         1279                         if(b->uhead){
         1280                                 /* add unlink blocks to unlink queue */
         1281                                 if(c->uhead == nil){
         1282                                         c->uhead = b->uhead;
         1283                                         rwakeup(&c->unlink);
         1284                                 }else
         1285                                         c->utail->next = b->uhead;
         1286                                 c->utail = b->utail;
         1287                                 b->uhead = nil;
         1288                         }
         1289                         qunlock(&c->lk);
         1290                 }
         1291                 assert(!b->uhead);
         1292                 dowakeup = 1;
         1293                 break;
         1294         case BioDirty:
         1295                 /*
         1296                  * Wrote out an old version of the block (see blockRollback).
         1297                  * Bump a version count, leave it dirty.
         1298                  */
         1299                 if(b->iostate == BioWriting){
         1300                         qlock(&c->lk);
         1301                         b->vers = c->vers++;
         1302                         qunlock(&c->lk);
         1303                         dowakeup = 1;
         1304                 }
         1305                 break;
         1306         case BioReading:
         1307         case BioWriting:
         1308                 /*
         1309                  * Adding block to disk queue.  Bump reference count.
         1310                  * diskThread decs the count later by calling blockPut.
         1311                  * This is here because we need to lock c->lk to
         1312                  * manipulate the ref count.
         1313                  */
         1314                 qlock(&c->lk);
         1315                 b->ref++;
         1316                 qunlock(&c->lk);
         1317                 break;
         1318         case BioReadError:
         1319         case BioVentiError:
         1320                 /*
         1321                  * Oops.
         1322                  */
         1323                 dowakeup = 1;
         1324                 break;
         1325         }
         1326         b->iostate = iostate;
         1327         /*
         1328          * Now that the state has changed, we can wake the waiters.
         1329          */
         1330         if(dowakeup)
         1331                 rwakeupall(&b->ioready);
         1332 }
         1333 
         1334 /*
         1335  * The active file system is a tree of blocks.
         1336  * When we add snapshots to the mix, the entire file system
         1337  * becomes a dag and thus requires a bit more care.
         1338  *
         1339  * The life of the file system is divided into epochs.  A snapshot
         1340  * ends one epoch and begins the next.  Each file system block
         1341  * is marked with the epoch in which it was created (b.epoch).
         1342  * When the block is unlinked from the file system (closed), it is marked
         1343  * with the epoch in which it was removed (b.epochClose).
         1344  * Once we have discarded or archived all snapshots up to
         1345  * b.epochClose, we can reclaim the block.
         1346  *
         1347  * If a block was created in a past epoch but is not yet closed,
         1348  * it is treated as copy-on-write.  Of course, in order to insert the
         1349  * new pointer into the tree, the parent must be made writable,
         1350  * and so on up the tree.  The recursion stops because the root
         1351  * block is always writable.
         1352  *
         1353  * If blocks are never closed, they will never be reused, and
         1354  * we will run out of disk space.  But marking a block as closed
         1355  * requires some care about dependencies and write orderings.
         1356  *
         1357  * (1) If a block p points at a copy-on-write block b and we
         1358  * copy b to create bb, then p must be written out after bb and
         1359  * lbb (bb's label block).
         1360  *
         1361  * (2) We have to mark b as closed, but only after we switch
         1362  * the pointer, so lb must be written out after p.  In fact, we
         1363  * can't even update the in-memory copy, or the cache might
         1364  * mistakenly give out b for reuse before p gets written.
         1365  *
         1366  * CacheAllocBlock's call to blockSetLabel records a "bb after lbb" dependency.
         1367  * The caller is expected to record a "p after bb" dependency
         1368  * to finish (1), and also expected to call blockRemoveLink
         1369  * to arrange for (2) to happen once p is written.
         1370  *
         1371  * Until (2) happens, some pieces of the code (e.g., the archiver)
         1372  * still need to know whether a block has been copied, so we
         1373  * set the BsCopied bit in the label and force that to disk *before*
         1374  * the copy gets written out.
         1375  */
         1376 Block*
         1377 blockCopy(Block *b, u32int tag, u32int ehi, u32int elo)
         1378 {
         1379         Block *bb, *lb;
         1380         Label l;
         1381 
         1382         if((b->l.state&BsClosed) || b->l.epoch >= ehi)
         1383                 fprint(2, "%s: blockCopy %#ux %L but fs is [%ud,%ud]\n",
         1384                         argv0, b->addr, &b->l, elo, ehi);
         1385 
         1386         bb = cacheAllocBlock(b->c, b->l.type, tag, ehi, elo);
         1387         if(bb == nil){
         1388                 blockPut(b);
         1389                 return nil;
         1390         }
         1391 
         1392         /*
         1393          * Update label so we know the block has been copied.
         1394          * (It will be marked closed once it has been unlinked from
         1395          * the tree.)  This must follow cacheAllocBlock since we
         1396          * can't be holding onto lb when we call cacheAllocBlock.
         1397          */
         1398         if((b->l.state&BsCopied)==0)
         1399         if(b->part == PartData){        /* not the superblock */
         1400                 l = b->l;
         1401                 l.state |= BsCopied;
         1402                 lb = _blockSetLabel(b, &l);
         1403                 if(lb == nil){
         1404                         /* can't set label => can't copy block */
         1405                         blockPut(b);
         1406                         l.type = BtMax;
         1407                         l.state = BsFree;
         1408                         l.epoch = 0;
         1409                         l.epochClose = 0;
         1410                         l.tag = 0;
         1411                         blockSetLabel(bb, &l, 0);
         1412                         blockPut(bb);
         1413                         return nil;
         1414                 }
         1415                 blockDependency(bb, lb, -1, nil, nil);
         1416                 blockPut(lb);
         1417         }
         1418 
         1419         memmove(bb->data, b->data, b->c->size);
         1420         blockDirty(bb);
         1421         blockPut(b);
         1422         return bb;
         1423 }
         1424 
         1425 /*
         1426  * Block b once pointed at the block bb at addr/type/tag, but no longer does.
         1427  * If recurse is set, we are unlinking all of bb's children as well.
         1428  *
         1429  * We can't reclaim bb (or its kids) until the block b gets written to disk.  We add
         1430  * the relevant information to b's list of unlinked blocks.  Once b is written,
         1431  * the list will be queued for processing.
         1432  *
         1433  * If b depends on bb, it doesn't anymore, so we remove bb from the prior list.
         1434  */
void
blockRemoveLink(Block *b, u32int addr, int type, u32int tag, int recurse)
{
	BList *p, **pp, bl;

	/*
	 * If b depends on the unlinked block (it is on b's prior list),
	 * that dependency no longer holds: drop every prior entry for addr.
	 */
	for(pp=&b->prior; (p=*pp)!=nil; ){
		if(p->part == PartData && p->addr == addr){
			*pp = p->next;
			blistFree(b->c, p);
		}else
			pp = &p->next;
	}

	/* build the unlink record to be processed once b reaches disk */
	bl.part = PartData;
	bl.addr = addr;
	bl.type = type;
	bl.tag = tag;
	/* epoch 0 means "close with the next epoch"; only the superblock has it */
	if(b->l.epoch == 0)
		assert(b->part == PartSuper);
	bl.epoch = b->l.epoch;
	bl.next = nil;
	bl.recurse = recurse;

	/*
	 * A clean superblock has no pending write to wait for, so the
	 * removal can be processed right away.  (blistAlloc likewise
	 * returns nil when b turns out to be clean already.)
	 */
	if(b->part == PartSuper && b->iostate == BioClean)
		p = nil;
	else
		p = blistAlloc(b);
	if(p == nil){
		/*
		 * b has already been written to disk.
		 */
		doRemoveLink(b->c, &bl);
		return;
	}

	/* Uhead is only processed when the block goes from Dirty -> Clean */
	assert(b->iostate == BioDirty);

	/* append the record to b's unlinked-block list (uhead/utail) */
	*p = bl;
	if(b->uhead == nil)
		b->uhead = p;
	else
		b->utail->next = p;
	b->utail = p;
}
         1481 
         1482 /*
         1483  * Process removal of a single block and perhaps its children.
         1484  */
static void
doRemoveLink(Cache *c, BList *p)
{
	int i, n, recurse;
	u32int a;
	Block *b;
	Label l;
	BList bl;

	/* only pointer blocks have children; BtData and BtDir are leaves here */
	recurse = (p->recurse && p->type != BtData && p->type != BtDir);

	/*
	 * We're not really going to overwrite b, but if we're not
	 * going to look at its contents, there is no point in reading
	 * them from the disk.
	 */
	b = cacheLocalData(c, p->addr, p->type, p->tag, recurse ? OReadOnly : OOverWrite, 0);
	if(b == nil)
		return;

	/*
	 * When we're unlinking from the superblock, close with the next epoch.
	 */
	if(p->epoch == 0)
		p->epoch = b->l.epoch+1;

	/* sanity check */
	if(b->l.epoch > p->epoch){
		fprint(2, "%s: doRemoveLink: strange epoch %ud > %ud\n",
			argv0, b->l.epoch, p->epoch);
		blockPut(b);
		return;
	}

	if(recurse){
		/* visit every child score stored in this pointer block */
		n = c->size / VtScoreSize;
		for(i=0; i<n; i++){
			a = globalToLocal(b->data + i*VtScoreSize);
			/* skip venti scores, unreadable labels, already-closed blocks */
			if(a == NilBlock || !readLabel(c, &l, a))
				continue;
			if(l.state&BsClosed)
				continue;
			/*
			 * If stack space becomes an issue...
			p->addr = a;
			p->type = l.type;
			p->tag = l.tag;
			doRemoveLink(c, p);
			 */

			bl.part = PartData;
			bl.addr = a;
			bl.type = l.type;
			bl.tag = l.tag;
			bl.epoch = p->epoch;
			bl.next = nil;
			bl.recurse = 1;
			/* give up the block lock - share with others */
			blockPut(b);
			doRemoveLink(c, &bl);
			/* reacquire b; another thread may have evicted it meanwhile */
			b = cacheLocalData(c, p->addr, p->type, p->tag, OReadOnly, 0);
			if(b == nil){
				fprint(2, "%s: warning: lost block in doRemoveLink\n",
					argv0);
				return;
			}
		}
	}

	/* mark the block closed as of p->epoch */
	l = b->l;
	l.state |= BsClosed;
	l.epochClose = p->epoch;
	if(l.epochClose == l.epoch){
		/*
		 * Opened and closed in the same epoch.  If that epoch is the
		 * free list's low epoch, adjust the in-use count under the
		 * free-list lock while setting the label.
		 */
		qlock(&c->fl->lk);
		if(l.epoch == c->fl->epochLow)
			c->fl->nused--;
		blockSetLabel(b, &l, 0);
		qunlock(&c->fl->lk);
	}else
		blockSetLabel(b, &l, 0);
	blockPut(b);
}
         1567 
         1568 /*
         1569  * Allocate a BList so that we can record a dependency
         1570  * or queue a removal related to block b.
         1571  * If we can't find a BList, we write out b and return nil.
         1572  */
static BList *
blistAlloc(Block *b)
{
	Cache *c;
	BList *p;

	if(b->iostate != BioDirty){
		/*
		 * should not happen anymore -
		 * blockDirty used to flush but no longer does.
		 */
		assert(b->iostate == BioClean);
		fprint(2, "%s: blistAlloc: called on clean block\n", argv0);
		return nil;
	}

	c = b->c;
	qlock(&c->lk);
	if(c->blfree == nil){
		/*
		 * No free BLists.  What are our options?
		 */

		/* Block has no priors? Just write it. */
		if(b->prior == nil){
			qunlock(&c->lk);
			diskWriteAndWait(c->disk, b);
			return nil;
		}

		/*
		 * Wake the flush thread, which will hopefully free up
		 * some BLists for us.  We used to flush a block from
		 * our own prior list and reclaim that BList, but this is
		 * a no-no: some of the blocks on our prior list may
		 * be locked by our caller.  Or maybe their label blocks
		 * are locked by our caller.  In any event, it's too hard
		 * to make sure we can do I/O for ourselves.  Instead,
		 * we assume the flush thread will find something.
		 * (The flush thread never blocks waiting for a block,
		 * so it can't deadlock like we can.)
		 */
		while(c->blfree == nil){
			rwakeup(&c->flush);
			rsleep(&c->blrend);	/* blistFree wakes this */
			if(c->blfree == nil)
				fprint(2, "%s: flushing for blists\n", argv0);
		}
	}

	/* pop a BList off the free list */
	p = c->blfree;
	c->blfree = p->next;
	qunlock(&c->lk);
	return p;
}
         1628 
/*
 * Return a BList to the cache's free list and wake any thread
 * sleeping in blistAlloc waiting for one.
 */
static void
blistFree(Cache *c, BList *bl)
{
	qlock(&c->lk);
	bl->next = c->blfree;	/* push onto free list */
	c->blfree = bl;
	rwakeup(&c->blrend);
	qunlock(&c->lk);
}
         1638 
         1639 char*
         1640 bsStr(int state)
         1641 {
         1642         static char s[100];
         1643 
         1644         if(state == BsFree)
         1645                 return "Free";
         1646         if(state == BsBad)
         1647                 return "Bad";
         1648 
         1649         sprint(s, "%x", state);
         1650         if(!(state&BsAlloc))
         1651                 strcat(s, ",Free");        /* should not happen */
         1652         if(state&BsCopied)
         1653                 strcat(s, ",Copied");
         1654         if(state&BsVenti)
         1655                 strcat(s, ",Venti");
         1656         if(state&BsClosed)
         1657                 strcat(s, ",Closed");
         1658         return s;
         1659 }
         1660 
         1661 char *
         1662 bioStr(int iostate)
         1663 {
         1664         switch(iostate){
         1665         default:
         1666                 return "Unknown!!";
         1667         case BioEmpty:
         1668                 return "Empty";
         1669         case BioLabel:
         1670                 return "Label";
         1671         case BioClean:
         1672                 return "Clean";
         1673         case BioDirty:
         1674                 return "Dirty";
         1675         case BioReading:
         1676                 return "Reading";
         1677         case BioWriting:
         1678                 return "Writing";
         1679         case BioReadError:
         1680                 return "ReadError";
         1681         case BioVentiError:
         1682                 return "VentiError";
         1683         case BioMax:
         1684                 return "Max";
         1685         }
         1686 }
         1687 
         1688 static char *bttab[] = {
         1689         "BtData",
         1690         "BtData+1",
         1691         "BtData+2",
         1692         "BtData+3",
         1693         "BtData+4",
         1694         "BtData+5",
         1695         "BtData+6",
         1696         "BtData+7",
         1697         "BtDir",
         1698         "BtDir+1",
         1699         "BtDir+2",
         1700         "BtDir+3",
         1701         "BtDir+4",
         1702         "BtDir+5",
         1703         "BtDir+6",
         1704         "BtDir+7",
         1705 };
         1706 
         1707 char*
         1708 btStr(int type)
         1709 {
         1710         if(type < nelem(bttab))
         1711                 return bttab[type];
         1712         return "unknown";
         1713 }
         1714 
         1715 int
         1716 labelFmt(Fmt *f)
         1717 {
         1718         Label *l;
         1719 
         1720         l = va_arg(f->args, Label*);
         1721         return fmtprint(f, "%s,%s,e=%ud,%d,tag=%#ux",
         1722                 btStr(l->type), bsStr(l->state), l->epoch, (int)l->epochClose, l->tag);
         1723 }
         1724 
         1725 int
         1726 scoreFmt(Fmt *f)
         1727 {
         1728         uchar *v;
         1729         int i;
         1730         u32int addr;
         1731 
         1732         v = va_arg(f->args, uchar*);
         1733         if(v == nil){
         1734                 fmtprint(f, "*");
         1735         }else if((addr = globalToLocal(v)) != NilBlock)
         1736                 fmtprint(f, "0x%.8ux", addr);
         1737         else{
         1738                 for(i = 0; i < VtScoreSize; i++)
         1739                         fmtprint(f, "%2.2ux", v[i]);
         1740         }
         1741 
         1742         return 0;
         1743 }
         1744 
         1745 static int
         1746 upHeap(int i, Block *b)
         1747 {
         1748         Block *bb;
         1749         u32int now;
         1750         int p;
         1751         Cache *c;
         1752 
         1753         c = b->c;
         1754         now = c->now;
         1755         for(; i != 0; i = p){
         1756                 p = (i - 1) >> 1;
         1757                 bb = c->heap[p];
         1758                 if(b->used - now >= bb->used - now)
         1759                         break;
         1760                 c->heap[i] = bb;
         1761                 bb->heap = i;
         1762         }
         1763         c->heap[i] = b;
         1764         b->heap = i;
         1765 
         1766         return i;
         1767 }
         1768 
         1769 static int
         1770 downHeap(int i, Block *b)
         1771 {
         1772         Block *bb;
         1773         u32int now;
         1774         int k;
         1775         Cache *c;
         1776 
         1777         c = b->c;
         1778         now = c->now;
         1779         for(; ; i = k){
         1780                 k = (i << 1) + 1;
         1781                 if(k >= c->nheap)
         1782                         break;
         1783                 if(k + 1 < c->nheap && c->heap[k]->used - now > c->heap[k + 1]->used - now)
         1784                         k++;
         1785                 bb = c->heap[k];
         1786                 if(b->used - now <= bb->used - now)
         1787                         break;
         1788                 c->heap[i] = bb;
         1789                 bb->heap = i;
         1790         }
         1791         c->heap[i] = b;
         1792         b->heap = i;
         1793         return i;
         1794 }
         1795 
         1796 /*
         1797  * Delete a block from the heap.
         1798  * Called with c->lk held.
         1799  */
         1800 static void
         1801 heapDel(Block *b)
         1802 {
         1803         int i, si;
         1804         Cache *c;
         1805 
         1806         c = b->c;
         1807 
         1808         si = b->heap;
         1809         if(si == BadHeap)
         1810                 return;
         1811         b->heap = BadHeap;
         1812         c->nheap--;
         1813         if(si == c->nheap)
         1814                 return;
         1815         b = c->heap[c->nheap];
         1816         i = upHeap(si, b);
         1817         if(i == si)
         1818                 downHeap(i, b);
         1819 }
         1820 
         1821 /*
         1822  * Insert a block into the heap.
         1823  * Called with c->lk held.
         1824  */
         1825 static void
         1826 heapIns(Block *b)
         1827 {
         1828         assert(b->heap == BadHeap);
         1829         upHeap(b->c->nheap++, b);
         1830         rwakeup(&b->c->heapwait);
         1831 }
         1832 
         1833 /*
         1834  * Get just the label for a block.
         1835  */
         1836 int
         1837 readLabel(Cache *c, Label *l, u32int addr)
         1838 {
         1839         int lpb;
         1840         Block *b;
         1841         u32int a;
         1842 
         1843         lpb = c->size / LabelSize;
         1844         a = addr / lpb;
         1845         b = cacheLocal(c, PartLabel, a, OReadOnly);
         1846         if(b == nil){
         1847                 blockPut(b);
         1848                 return 0;
         1849         }
         1850 
         1851         if(!labelUnpack(l, b->data, addr%lpb)){
         1852                 blockPut(b);
         1853                 return 0;
         1854         }
         1855         blockPut(b);
         1856         return 1;
         1857 }
         1858 
         1859 /*
         1860  * Process unlink queue.
         1861  * Called with c->lk held.
         1862  */
static void
unlinkBody(Cache *c)
{
	BList *p;

	while(c->uhead != nil){
		/* pop the head of the unlink queue */
		p = c->uhead;
		c->uhead = p->next;
		/* drop the cache lock: doRemoveLink does block I/O */
		qunlock(&c->lk);
		doRemoveLink(c, p);
		qlock(&c->lk);
		/*
		 * Return the BList to the free list.
		 * NOTE(review): unlike blistFree, this does not
		 * rwakeup(&c->blrend); confirm waiters in blistAlloc
		 * cannot stall because of it.
		 */
		p->next = c->blfree;
		c->blfree = p;
	}
}
         1878 
         1879 /*
         1880  * Occasionally unlink the blocks on the cache unlink queue.
         1881  */
static void
unlinkThread(void *a)
{
	Cache *c = a;

	threadsetname("unlink");

	qlock(&c->lk);
	for(;;){
		/* sleep until unlinks are queued or shutdown is requested */
		while(c->uhead == nil && c->die.l == nil)
			rsleep(&c->unlink);
		if(c->die.l != nil)	/* non-nil die.l signals shutdown */
			break;
		unlinkBody(c);
	}
	/* shutdown: drop our cache reference and wake the die waiter */
	c->ref--;
	rwakeup(&c->die);
	qunlock(&c->lk);
}
         1901 
         1902 static int
         1903 baddrCmp(const void *a0, const void *a1)
         1904 {
         1905         BAddr *b0, *b1;
         1906         b0 = (BAddr*)a0;
         1907         b1 = (BAddr*)a1;
         1908 
         1909         if(b0->part < b1->part)
         1910                 return -1;
         1911         if(b0->part > b1->part)
         1912                 return 1;
         1913         if(b0->addr < b1->addr)
         1914                 return -1;
         1915         if(b0->addr > b1->addr)
         1916                 return 1;
         1917         return 0;
         1918 }
         1919 
         1920 /*
         1921  * Scan the block list for dirty blocks; add them to the list c->baddr.
         1922  */
         1923 static void
         1924 flushFill(Cache *c)
         1925 {
         1926         int i, ndirty;
         1927         BAddr *p;
         1928         Block *b;
         1929 
         1930         qlock(&c->lk);
         1931         if(c->ndirty == 0){
         1932                 qunlock(&c->lk);
         1933                 return;
         1934         }
         1935 
         1936         p = c->baddr;
         1937         ndirty = 0;
         1938         for(i=0; i<c->nblocks; i++){
         1939                 b = c->blocks + i;
         1940                 if(b->part == PartError)
         1941                         continue;
         1942                 if(b->iostate == BioDirty || b->iostate == BioWriting)
         1943                         ndirty++;
         1944                 if(b->iostate != BioDirty)
         1945                         continue;
         1946                 p->part = b->part;
         1947                 p->addr = b->addr;
         1948                 p->vers = b->vers;
         1949                 p++;
         1950         }
         1951         if(ndirty != c->ndirty){
         1952                 fprint(2, "%s: ndirty mismatch expected %d found %d\n",
         1953                         argv0, c->ndirty, ndirty);
         1954                 c->ndirty = ndirty;
         1955         }
         1956         qunlock(&c->lk);
         1957 
         1958         c->bw = p - c->baddr;
         1959         qsort(c->baddr, c->bw, sizeof(BAddr), baddrCmp);
         1960 }
         1961 
         1962 /*
         1963  * This is not thread safe, i.e. it can't be called from multiple threads.
         1964  *
         1965  * It's okay how we use it, because it only gets called in
         1966  * the flushThread.  And cacheFree, but only after
         1967  * cacheFree has killed off the flushThread.
         1968  */
static int
cacheFlushBlock(Cache *c)
{
	Block *b;
	BAddr *p;
	int lockfail, nfail;

	nfail = 0;
	for(;;){
		/*
		 * Reader (br) caught up with the end (be) of the list:
		 * promote the requeued entries in baddr[0..bw) to the new
		 * work list, refilling from scratch if none were requeued.
		 */
		if(c->br == c->be){
			if(c->bw == 0 || c->bw == c->be)
				flushFill(c);
			c->br = 0;
			c->be = c->bw;
			c->bw = 0;
			c->nflush = 0;
		}

		if(c->br == c->be)
			return 0;	/* nothing left to flush */
		p = c->baddr + c->br;
		c->br++;
		/* Nowaitlock: the flush thread must never block on a busy block */
		b = _cacheLocalLookup(c, p->part, p->addr, p->vers, Nowaitlock,
			&lockfail);

		if(b && blockWrite(b, Nowaitlock)){
			c->nflush++;
			blockPut(b);
			return 1;
		}
		if(b)
			blockPut(b);

		/*
		 * Why didn't we write the block?
		 */

		/* Block already written out */
		if(b == nil && !lockfail)
			continue;

		/* Failed to acquire lock; sleep if happens a lot. */
		if(lockfail && ++nfail > 100){
			sleep(500);
			nfail = 0;
		}
		/* Requeue block for a later attempt. */
		if(c->bw < c->be)
			c->baddr[c->bw++] = *p;
	}
}
         2020 
         2021 /*
         2022  * Occasionally flush dirty blocks from memory to the disk.
         2023  */
static void
flushThread(void *a)
{
	Cache *c = a;
	int i;

	threadsetname("flush");
	qlock(&c->lk);
	while(c->die.l == nil){	/* non-nil die.l signals shutdown */
		/* wait to be kicked (cacheFlush and blistAlloc rwakeup c->flush) */
		rsleep(&c->flush);
		qunlock(&c->lk);
		/* write up to FlushSize blocks per kick */
		for(i=0; i<FlushSize; i++)
			if(!cacheFlushBlock(c)){
				/*
				 * If i==0, could be someone is waking us repeatedly
				 * to flush the cache but there's no work to do.
				 * Pause a little.
				 */
				if(i==0){
					// fprint(2, "%s: flushthread found "
					//        "nothing to flush - %d dirty\n",
					//        argv0, c->ndirty);
					sleep(250);
				}
				break;
			}
		if(i==0 && c->ndirty){
			/*
			 * All the blocks are being written right now -- there's nothing to do.
			 * We might be spinning with cacheFlush though -- he'll just keep
			 * kicking us until c->ndirty goes down.  Probably we should sleep
			 * on something that the diskThread can kick, but for now we'll
			 * just pause for a little while waiting for disks to finish.
			 */
			sleep(100);
		}
		qlock(&c->lk);
		/* let synchronous cacheFlush callers re-check c->ndirty */
		rwakeupall(&c->flushwait);
	}
	/* shutdown: drop our cache reference and wake the die waiter */
	c->ref--;
	rwakeup(&c->die);
	qunlock(&c->lk);
}
         2067 
         2068 /*
         2069  * Flush the cache.
         2070  */
/*
 * If wait is set, kick the flush thread repeatedly and sleep on
 * flushwait until no dirty blocks remain; otherwise just wake the
 * flush thread once (if anything is dirty) and return.
 */
void
cacheFlush(Cache *c, int wait)
{
	qlock(&c->lk);
	if(wait){
		while(c->ndirty){
		//	consPrint("cacheFlush: %d dirty blocks, uhead %p\n",
		//		c->ndirty, c->uhead);
			rwakeup(&c->flush);
			rsleep(&c->flushwait);
		}
	//	consPrint("cacheFlush: done (uhead %p)\n", c->ndirty, c->uhead);
	}else if(c->ndirty)
		rwakeup(&c->flush);
	qunlock(&c->lk);
}
         2087 
         2088 /*
         2089  * Kick the flushThread every 30 seconds.
         2090  */
         2091 static void
         2092 cacheSync(void *v)
         2093 {
         2094         Cache *c;
         2095 
         2096         c = v;
         2097         cacheFlush(c, 0);
         2098 }