tcache.c - plan9port - [fork] Plan 9 from user space
git clone git://src.adamsgaard.dk/plan9port
---
tcache.c (43774B)
---
1 #include "stdinc.h"
2 #include "dat.h"
3 #include "fns.h"
4 #include "error.h"
5
6 #include "9.h" /* for cacheFlush */
7
8 typedef struct FreeList FreeList;
9 typedef struct BAddr BAddr;
10
11 enum {
12 BadHeap = ~0,
13 };
14
15 /*
16 * Store data to the memory cache in c->size blocks
17 * with the block zero extended to fill it out. When writing to
18 * Venti, the block will be zero truncated. The walker will also check
19 * that the block fits within psize or dsize as the case may be.
20 */
21
22 struct Cache
23 {
24 QLock lk;
25 int ref;
26 int mode;
27
28 Disk *disk;
29 int size; /* block size */
30 int ndmap; /* size of per-block dirty pointer map used in blockWrite */
31 VtConn *z;
32 u32int now; /* ticks for usage timestamps */
33 Block **heads; /* hash table for finding address */
34 int nheap; /* number of available victims */
35 Block **heap; /* heap for locating victims */
36 long nblocks; /* number of blocks allocated */
37 Block *blocks; /* array of block descriptors */
38 u8int *mem; /* memory for all block data & blists */
39
40 BList *blfree;
41 Rendez blrend;
42
43 int ndirty; /* number of dirty blocks in the cache */
44 int maxdirty; /* max number of dirty blocks */
45 u32int vers;
46
47 long hashSize;
48
49 FreeList *fl;
50
51 Rendez die; /* daemon threads should die when c->die.l != nil */
52
53 Rendez flush;
54 Rendez flushwait;
55 Rendez heapwait;
56 BAddr *baddr;
57 int bw, br, be;
58 int nflush;
59
60 Periodic *sync;
61
62 /* unlink daemon */
63 BList *uhead;
64 BList *utail;
65 Rendez unlink;
66
67 /* block counts */
68 int nused;
69 int ndisk;
70 };
71
72 struct BList {
73 int part;
74 u32int addr;
75 uchar type;
76 u32int tag;
77 u32int epoch;
78 u32int vers;
79
80 int recurse; /* for block unlink */
81
82 /* for roll back */
83 int index; /* -1 indicates not valid */
84 union {
85 uchar score[VtScoreSize];
86 uchar entry[VtEntrySize];
87 } old;
88 BList *next;
89 };
90
91 struct BAddr {
92 int part;
93 u32int addr;
94 u32int vers;
95 };
96
97 struct FreeList {
98 QLock lk;
99 u32int last; /* last block allocated */
100 u32int end; /* end of data partition */
101 u32int nused; /* number of used blocks */
102 u32int epochLow; /* low epoch when last updated nused */
103 };
104
105 static FreeList *flAlloc(u32int end);
106 static void flFree(FreeList *fl);
107
108 static Block *cacheBumpBlock(Cache *c);
109 static void heapDel(Block*);
110 static void heapIns(Block*);
111 static void cacheCheck(Cache*);
112 static void unlinkThread(void *a);
113 static void flushThread(void *a);
114 static void unlinkBody(Cache *c);
115 static int cacheFlushBlock(Cache *c);
116 static void cacheSync(void*);
117 static BList *blistAlloc(Block*);
118 static void blistFree(Cache*, BList*);
119 static void doRemoveLink(Cache*, BList*);
120
121 /*
122 * Mapping from local block type to Venti type
123 */
124 int vtType[BtMax] = {
125 VtDataType, /* BtData | 0 */
126 VtDataType+1, /* BtData | 1 */
127 VtDataType+2, /* BtData | 2 */
128 VtDataType+3, /* BtData | 3 */
129 VtDataType+4, /* BtData | 4 */
130 VtDataType+5, /* BtData | 5 */
131 VtDataType+6, /* BtData | 6 */
132 VtDataType+7, /* BtData | 7 */
133 VtDirType, /* BtDir | 0 */
134 VtDirType+1, /* BtDir | 1 */
135 VtDirType+2, /* BtDir | 2 */
136 VtDirType+3, /* BtDir | 3 */
137 VtDirType+4, /* BtDir | 4 */
138 VtDirType+5, /* BtDir | 5 */
139 VtDirType+6, /* BtDir | 6 */
140 VtDirType+7, /* BtDir | 7 */
141 };
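/*
 * For example, reading the table by its own comments: a data block at
 * pointer depth 3 has local type BtData|3 and maps to vtType[BtData|3],
 * i.e. VtDataType+3, while a directory pointer block at depth 1 maps to
 * vtType[BtDir|1], i.e. VtDirType+1.
 */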
142
143 /*
144 * Allocate the memory cache.
145 */
146 Cache *
147 cacheAlloc(Disk *disk, VtConn *z, ulong nblocks, int mode)
148 {
149 int i;
150 Cache *c;
151 Block *b;
152 BList *bl;
153 u8int *p;
154 int nbl;
155
156 c = vtmallocz(sizeof(Cache));
157
158 /* reasonable number of BList elements */
159 nbl = nblocks * 4;
160
161 c->ref = 1;
162 c->disk = disk;
163 c->z = z;
164 c->size = diskBlockSize(disk);
165 bwatchSetBlockSize(c->size);
166 /* round c->size up to be a nice multiple */
167 c->size = (c->size + 127) & ~127;
168 c->ndmap = (c->size/20 + 7) / 8;
169 c->nblocks = nblocks;
170 c->hashSize = nblocks;
171 c->heads = vtmallocz(c->hashSize*sizeof(Block*));
172 c->heap = vtmallocz(nblocks*sizeof(Block*));
173 c->blocks = vtmallocz(nblocks*sizeof(Block));
174 c->mem = vtmallocz(nblocks * (c->size + c->ndmap) + nbl * sizeof(BList));
175 c->baddr = vtmallocz(nblocks * sizeof(BAddr));
176 c->mode = mode;
177 c->vers++;
178 p = c->mem;
179 for(i = 0; i < nblocks; i++){
180 b = &c->blocks[i];
181 b->c = c;
182 b->data = p;
183 b->heap = i;
184 b->ioready.l = &b->lk;
185 c->heap[i] = b;
186 p += c->size;
187 }
188 c->nheap = nblocks;
189 for(i = 0; i < nbl; i++){
190 bl = (BList*)p;
191 bl->next = c->blfree;
192 c->blfree = bl;
193 p += sizeof(BList);
194 }
195 /* separate loop to keep blocks and blists reasonably aligned */
196 for(i = 0; i < nblocks; i++){
197 b = &c->blocks[i];
198 b->dmap = p;
199 p += c->ndmap;
200 }
201
202 c->blrend.l = &c->lk;
203
204 c->maxdirty = nblocks*(DirtyPercentage*0.01);
205
206 c->fl = flAlloc(diskSize(disk, PartData));
207
208 c->unlink.l = &c->lk;
209 c->flush.l = &c->lk;
210 c->flushwait.l = &c->lk;
211 c->heapwait.l = &c->lk;
212 c->sync = periodicAlloc(cacheSync, c, 30*1000);
213
214 if(mode == OReadWrite){
215 c->ref += 2;
216 proccreate(unlinkThread, c, STACK);
217 proccreate(flushThread, c, STACK);
218 }
219 cacheCheck(c);
220
221 return c;
222 }
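/*
 * A minimal usage sketch (illustrative only; the disk, Venti connection
 * and block count are placeholders supplied by the caller, not values
 * taken from this file):
 *
 *	Cache *c;
 *
 *	c = cacheAlloc(disk, z, 1000, OReadWrite);
 *	...use cacheLocal/cacheGlobal/cacheAllocBlock on c...
 *	cacheFlush(c, 1);
 *	cacheFree(c);
 */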
223
224 /*
225 * Free the whole memory cache, flushing all dirty blocks to the disk.
226 */
227 void
228 cacheFree(Cache *c)
229 {
230 int i;
231
232 /* kill off daemon threads */
233 qlock(&c->lk);
234 c->die.l = &c->lk;
235 periodicKill(c->sync);
236 rwakeup(&c->flush);
237 rwakeup(&c->unlink);
238 while(c->ref > 1)
239 rsleep(&c->die);
240
241 /* flush everything out */
242 do {
243 unlinkBody(c);
244 qunlock(&c->lk);
245 while(cacheFlushBlock(c))
246 ;
247 diskFlush(c->disk);
248 qlock(&c->lk);
249 } while(c->uhead || c->ndirty);
250 qunlock(&c->lk);
251
252 cacheCheck(c);
253
254 for(i = 0; i < c->nblocks; i++){
255 assert(c->blocks[i].ref == 0);
256 }
257 flFree(c->fl);
258 vtfree(c->baddr);
259 vtfree(c->heads);
260 vtfree(c->blocks);
261 vtfree(c->mem);
262 diskFree(c->disk);
263 /* don't close vtSession */
264 vtfree(c);
265 }
266
267 static void
268 cacheDump(Cache *c)
269 {
270 int i;
271 Block *b;
272
273 for(i = 0; i < c->nblocks; i++){
274 b = &c->blocks[i];
275 fprint(2, "%d. p=%d a=%ud %V t=%d ref=%d state=%s io=%s pc=%#p\n",
276 i, b->part, b->addr, b->score, b->l.type, b->ref,
277 bsStr(b->l.state), bioStr(b->iostate), b->pc);
278 }
279 }
280
281 static void
282 cacheCheck(Cache *c)
283 {
284 u32int size, now;
285 int i, k, refed;
286 Block *b;
287
288 size = c->size;
289 now = c->now;
290
291 for(i = 0; i < c->nheap; i++){
292 if(c->heap[i]->heap != i)
293 sysfatal("mis-heaped at %d: %d", i, c->heap[i]->heap);
294 if(i > 0 && c->heap[(i - 1) >> 1]->used - now > c->heap[i]->used - now)
295 sysfatal("bad heap ordering");
296 k = (i << 1) + 1;
297 if(k < c->nheap && c->heap[i]->used - now > c->heap[k]->used - now)
298 sysfatal("bad heap ordering");
299 k++;
300 if(k < c->nheap && c->heap[i]->used - now > c->heap[k]->used - now)
301 sysfatal("bad heap ordering");
302 }
303
304 refed = 0;
305 for(i = 0; i < c->nblocks; i++){
306 b = &c->blocks[i];
307 if(b->data != &c->mem[i * size])
308 sysfatal("mis-blocked at %d", i);
309 if(b->ref && b->heap == BadHeap){
310 refed++;
311 }
312 }
313 if(c->nheap + refed != c->nblocks){
314 fprint(2, "%s: cacheCheck: nheap %d refed %d nblocks %ld\n", argv0, c->nheap, refed, c->nblocks);
315 cacheDump(c);
316 }
317 assert(c->nheap + refed == c->nblocks);
318 refed = 0;
319 for(i = 0; i < c->nblocks; i++){
320 b = &c->blocks[i];
321 if(b->ref){
322 if(1)fprint(2, "%s: p=%d a=%ud %V ref=%d %L\n", argv0, b->part, b->addr, b->score, b->ref, &b->l);
323 refed++;
324 }
325 }
326 if(refed > 0)fprint(2, "%s: cacheCheck: in used %d\n", argv0, refed);
327 }
328
329
330 /*
331 * locate the block with the oldest second to last use.
332 * remove it from the heap, and fix up the heap.
333 */
334 /* called with c->lk held */
335 static Block *
336 cacheBumpBlock(Cache *c)
337 {
338 int printed;
339 Block *b;
340
341 /*
342 * locate the block with the oldest second to last use.
343 * remove it from the heap, and fix up the heap.
344 */
345 printed = 0;
346 if(c->nheap == 0){
347 while(c->nheap == 0){
348 rwakeup(&c->flush);
349 rsleep(&c->heapwait);
350 if(c->nheap == 0){
351 printed = 1;
352 fprint(2, "%s: entire cache is busy, %d dirty "
353 "-- waking flush thread\n",
354 argv0, c->ndirty);
355 }
356 }
357 if(printed)
358 fprint(2, "%s: cache is okay again, %d dirty\n",
359 argv0, c->ndirty);
360 }
361
362 b = c->heap[0];
363 heapDel(b);
364
365 assert(b->heap == BadHeap);
366 assert(b->ref == 0);
367 assert(b->iostate != BioDirty && b->iostate != BioReading && b->iostate != BioWriting);
368 assert(b->prior == nil);
369 assert(b->uhead == nil);
370
371 /*
372 * unchain the block from hash chain
373 */
374 if(b->prev){
375 *(b->prev) = b->next;
376 if(b->next)
377 b->next->prev = b->prev;
378 b->prev = nil;
379 }
380
381
382 if(0)fprint(2, "%s: dropping %d:%x:%V\n", argv0, b->part, b->addr, b->score);
383 /* set block to a reasonable state */
384 b->ref = 1;
385 b->part = PartError;
386 memset(&b->l, 0, sizeof(b->l));
387 b->iostate = BioEmpty;
388
389 return b;
390 }
391
392 /*
393 * look for a particular version of the block in the memory cache.
394 */
395 static Block *
396 _cacheLocalLookup(Cache *c, int part, u32int addr, u32int vers,
397 int waitlock, int *lockfailure)
398 {
399 Block *b;
400 ulong h;
401
402 h = addr % c->hashSize;
403
404 if(lockfailure)
405 *lockfailure = 0;
406
407 /*
408 * look for the block in the cache
409 */
410 qlock(&c->lk);
411 for(b = c->heads[h]; b != nil; b = b->next){
412 if(b->part == part && b->addr == addr)
413 break;
414 }
415 if(b == nil || b->vers != vers){
416 qunlock(&c->lk);
417 return nil;
418 }
419 if(!waitlock && !canqlock(&b->lk)){
420 *lockfailure = 1;
421 qunlock(&c->lk);
422 return nil;
423 }
424 heapDel(b);
425 b->ref++;
426 qunlock(&c->lk);
427
428 bwatchLock(b);
429 if(waitlock)
430 qlock(&b->lk);
431 b->nlock = 1;
432
433 for(;;){
434 switch(b->iostate){
435 default:
436 abort();
437 case BioEmpty:
438 case BioLabel:
439 case BioClean:
440 case BioDirty:
441 if(b->vers != vers){
442 blockPut(b);
443 return nil;
444 }
445 return b;
446 case BioReading:
447 case BioWriting:
448 rsleep(&b->ioready);
449 break;
450 case BioVentiError:
451 blockPut(b);
452 werrstr("venti i/o error block 0x%.8ux", addr);
453 return nil;
454 case BioReadError:
455 blockPut(b);
456 werrstr("error reading block 0x%.8ux", addr);
457 return nil;
458 }
459 }
460 /* NOT REACHED */
461 }
462
463
464 /*
465 * fetch a local (on-disk) block from the memory cache.
466 * if it's not there, load it, bumping some other block.
467 */
468 Block *
469 _cacheLocal(Cache *c, int part, u32int addr, int mode, u32int epoch)
470 {
471 Block *b;
472 ulong h;
473
474 assert(part != PartVenti);
475
476 h = addr % c->hashSize;
477
478 /*
479 * look for the block in the cache
480 */
481 qlock(&c->lk);
482 for(b = c->heads[h]; b != nil; b = b->next){
483 if(b->part != part || b->addr != addr)
484 continue;
485 if(epoch && b->l.epoch != epoch){
486 fprint(2, "%s: _cacheLocal want epoch %ud got %ud\n", argv0, epoch, b->l.epoch);
487 qunlock(&c->lk);
488 werrstr(ELabelMismatch);
489 return nil;
490 }
491 heapDel(b);
492 b->ref++;
493 break;
494 }
495
496 if(b == nil){
497 b = cacheBumpBlock(c);
498
499 b->part = part;
500 b->addr = addr;
501 localToGlobal(addr, b->score);
502
503 /* chain onto correct hash */
504 b->next = c->heads[h];
505 c->heads[h] = b;
506 if(b->next != nil)
507 b->next->prev = &b->next;
508 b->prev = &c->heads[h];
509 }
510
511 qunlock(&c->lk);
512
513 /*
514 * BUG: what if the epoch changes right here?
515 * In the worst case, we could end up in some weird
516 * lock loop, because the block we want no longer exists,
517 * and instead we're trying to lock a block we have no
518 * business grabbing.
519 *
520 * For now, I'm not going to worry about it.
521 */
522
523 if(0)fprint(2, "%s: cacheLocal: %d: %d %x\n", argv0, getpid(), b->part, b->addr);
524 bwatchLock(b);
525 qlock(&b->lk);
526 b->nlock = 1;
527
528 if(part == PartData && b->iostate == BioEmpty){
529 if(!readLabel(c, &b->l, addr)){
530 blockPut(b);
531 return nil;
532 }
533 blockSetIOState(b, BioLabel);
534 }
535 if(epoch && b->l.epoch != epoch){
536 blockPut(b);
537 fprint(2, "%s: _cacheLocal want epoch %ud got %ud\n", argv0, epoch, b->l.epoch);
538 werrstr(ELabelMismatch);
539 return nil;
540 }
541
542 b->pc = getcallerpc(&c);
543 for(;;){
544 switch(b->iostate){
545 default:
546 abort();
547 case BioLabel:
548 if(mode == OOverWrite)
549 /*
550 * leave iostate as BioLabel because data
551 * hasn't been read.
552 */
553 return b;
554 /* fall through */
555 case BioEmpty:
556 diskRead(c->disk, b);
557 rsleep(&b->ioready);
558 break;
559 case BioClean:
560 case BioDirty:
561 return b;
562 case BioReading:
563 case BioWriting:
564 rsleep(&b->ioready);
565 break;
566 case BioReadError:
567 blockSetIOState(b, BioEmpty);
568 blockPut(b);
569 werrstr("error reading block 0x%.8ux", addr);
570 return nil;
571 }
572 }
573 /* NOT REACHED */
574 }
575
576 Block *
577 cacheLocal(Cache *c, int part, u32int addr, int mode)
578 {
579 return _cacheLocal(c, part, addr, mode, 0);
580 }
581
582 /*
583 * fetch a local (on-disk) block from the memory cache.
584 * if it's not there, load it, bumping some other block.
585 * check tag and type.
586 */
587 Block *
588 cacheLocalData(Cache *c, u32int addr, int type, u32int tag, int mode, u32int epoch)
589 {
590 Block *b;
591
592 b = _cacheLocal(c, PartData, addr, mode, epoch);
593 if(b == nil)
594 return nil;
595 if(b->l.type != type || b->l.tag != tag){
596 fprint(2, "%s: cacheLocalData: addr=%d type got %d exp %d: tag got %ux exp %ux\n",
597 argv0, addr, b->l.type, type, b->l.tag, tag);
598 werrstr(ELabelMismatch);
599 blockPut(b);
600 return nil;
601 }
602 b->pc = getcallerpc(&c);
603 return b;
604 }
605
606 /*
607 * fetch a global (Venti) block from the memory cache.
608 * if it's not there, load it, bumping some other block.
609 * check tag and type if it's really a local block in disguise.
610 */
611 Block *
612 cacheGlobal(Cache *c, uchar score[VtScoreSize], int type, u32int tag, int mode)
613 {
614 int n;
615 Block *b;
616 ulong h;
617 u32int addr;
618
619 addr = globalToLocal(score);
620 if(addr != NilBlock){
621 b = cacheLocalData(c, addr, type, tag, mode, 0);
622 if(b)
623 b->pc = getcallerpc(&c);
624 return b;
625 }
626
627 h = (u32int)(score[0]|(score[1]<<8)|(score[2]<<16)|(score[3]<<24)) % c->hashSize;
628
629 /*
630 * look for the block in the cache
631 */
632 qlock(&c->lk);
633 for(b = c->heads[h]; b != nil; b = b->next){
634 if(b->part != PartVenti || memcmp(b->score, score, VtScoreSize) != 0 || b->l.type != type)
635 continue;
636 heapDel(b);
637 b->ref++;
638 break;
639 }
640
641 if(b == nil){
642 if(0)fprint(2, "%s: cacheGlobal %V %d\n", argv0, score, type);
643
644 b = cacheBumpBlock(c);
645
646 b->part = PartVenti;
647 b->addr = NilBlock;
648 b->l.type = type;
649 memmove(b->score, score, VtScoreSize);
650
651 /* chain onto correct hash */
652 b->next = c->heads[h];
653 c->heads[h] = b;
654 if(b->next != nil)
655 b->next->prev = &b->next;
656 b->prev = &c->heads[h];
657 }
658 qunlock(&c->lk);
659
660 bwatchLock(b);
661 qlock(&b->lk);
662 b->nlock = 1;
663 b->pc = getcallerpc(&c);
664
665 switch(b->iostate){
666 default:
667 abort();
668 case BioEmpty:
669 n = vtread(c->z, score, vtType[type], b->data, c->size);
670 if(n < 0 || vtsha1check(score, b->data, n) < 0){
671 blockSetIOState(b, BioVentiError);
672 blockPut(b);
673 werrstr(
674 "venti error reading block %V or wrong score: %r",
675 score);
676 return nil;
677 }
678 vtzeroextend(vtType[type], b->data, n, c->size);
679 blockSetIOState(b, BioClean);
680 return b;
681 case BioClean:
682 return b;
683 case BioVentiError:
684 blockPut(b);
685 werrstr("venti i/o error or wrong score, block %V", score);
686 return nil;
687 case BioReadError:
688 blockPut(b);
689 werrstr("error reading block %V", b->score);
690 return nil;
691 }
692 /* NOT REACHED */
693 }
694
695 /*
696 * allocate a new on-disk block and load it into the memory cache.
697 * BUG: if the disk is full, should we flush some of it to Venti?
698 */
699 static u32int lastAlloc;
700
701 Block *
702 cacheAllocBlock(Cache *c, int type, u32int tag, u32int epoch, u32int epochLow)
703 {
704 FreeList *fl;
705 u32int addr;
706 Block *b;
707 int n, nwrap;
708 Label lab;
709
710 n = c->size / LabelSize;
711 fl = c->fl;
712
713 qlock(&fl->lk);
714 addr = fl->last;
715 b = cacheLocal(c, PartLabel, addr/n, OReadOnly);
716 if(b == nil){
717 fprint(2, "%s: cacheAllocBlock: xxx %r\n", argv0);
718 qunlock(&fl->lk);
719 return nil;
720 }
721 nwrap = 0;
722 for(;;){
723 if(++addr >= fl->end){
724 addr = 0;
725 if(++nwrap >= 2){
726 blockPut(b);
727 werrstr("disk is full");
728 /*
729 * try to avoid a continuous spew of console
730 * messages.
731 */
732 if (fl->last != 0)
733 fprint(2, "%s: cacheAllocBlock: xxx1 %r\n",
734 argv0);
735 fl->last = 0;
736 qunlock(&fl->lk);
737 return nil;
738 }
739 }
740 if(addr%n == 0){
741 blockPut(b);
742 b = cacheLocal(c, PartLabel, addr/n, OReadOnly);
743 if(b == nil){
744 fl->last = addr;
745 fprint(2, "%s: cacheAllocBlock: xxx2 %r\n", argv0);
746 qunlock(&fl->lk);
747 return nil;
748 }
749 }
750 if(!labelUnpack(&lab, b->data, addr%n))
751 continue;
752 if(lab.state == BsFree)
753 goto Found;
754 if(lab.state&BsClosed)
755 if(lab.epochClose <= epochLow || lab.epoch==lab.epochClose)
756 goto Found;
757 }
758 Found:
759 blockPut(b);
760 b = cacheLocal(c, PartData, addr, OOverWrite);
761 if(b == nil){
762 fprint(2, "%s: cacheAllocBlock: xxx3 %r\n", argv0);
763 return nil;
764 }
765 assert(b->iostate == BioLabel || b->iostate == BioClean);
766 fl->last = addr;
767 lab.type = type;
768 lab.tag = tag;
769 lab.state = BsAlloc;
770 lab.epoch = epoch;
771 lab.epochClose = ~(u32int)0;
772 if(!blockSetLabel(b, &lab, 1)){
773 fprint(2, "%s: cacheAllocBlock: xxx4 %r\n", argv0);
774 blockPut(b);
775 return nil;
776 }
777 vtzeroextend(vtType[type], b->data, 0, c->size);
778 if(0)diskWrite(c->disk, b);
779
780 if(0)fprint(2, "%s: fsAlloc %ud type=%d tag = %ux\n", argv0, addr, type, tag);
781 lastAlloc = addr;
782 fl->nused++;
783 qunlock(&fl->lk);
784 b->pc = getcallerpc(&c);
785 return b;
786 }
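/*
 * Sketch of a typical allocation (hypothetical caller; tag and the two
 * epochs come from the caller's file-system state and are placeholders
 * here):
 *
 *	b = cacheAllocBlock(c, BtData, tag, ehi, elo);
 *	if(b == nil)
 *		return nil;
 *	memmove(b->data, buf, n);
 *	blockDirty(b);
 *	blockPut(b);
 */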
787
788 int
789 cacheDirty(Cache *c)
790 {
791 return c->ndirty;
792 }
793
794 void
795 cacheCountUsed(Cache *c, u32int epochLow, u32int *used, u32int *total, u32int *bsize)
796 {
797 int n;
798 u32int addr, nused;
799 Block *b;
800 Label lab;
801 FreeList *fl;
802
803 fl = c->fl;
804 n = c->size / LabelSize;
805 *bsize = c->size;
806 qlock(&fl->lk);
807 if(fl->epochLow == epochLow){
808 *used = fl->nused;
809 *total = fl->end;
810 qunlock(&fl->lk);
811 return;
812 }
813 b = nil;
814 nused = 0;
815 for(addr=0; addr<fl->end; addr++){
816 if(addr%n == 0){
817 blockPut(b);
818 b = cacheLocal(c, PartLabel, addr/n, OReadOnly);
819 if(b == nil){
820 fprint(2, "%s: flCountUsed: loading %ux: %r\n",
821 argv0, addr/n);
822 break;
823 }
824 }
825 if(!labelUnpack(&lab, b->data, addr%n))
826 continue;
827 if(lab.state == BsFree)
828 continue;
829 if(lab.state&BsClosed)
830 if(lab.epochClose <= epochLow || lab.epoch==lab.epochClose)
831 continue;
832 nused++;
833 }
834 blockPut(b);
835 if(addr == fl->end){
836 fl->nused = nused;
837 fl->epochLow = epochLow;
838 }
839 *used = nused;
840 *total = fl->end;
841 qunlock(&fl->lk);
842 return;
843 }
844
845 static FreeList *
846 flAlloc(u32int end)
847 {
848 FreeList *fl;
849
850 fl = vtmallocz(sizeof(*fl));
851 fl->last = 0;
852 fl->end = end;
853 return fl;
854 }
855
856 static void
857 flFree(FreeList *fl)
858 {
859 vtfree(fl);
860 }
861
862 u32int
863 cacheLocalSize(Cache *c, int part)
864 {
865 return diskSize(c->disk, part);
866 }
867
868 /*
869 * The thread that has locked b may refer to it by
870 * multiple names. Nlock counts the number of
871 * references the locking thread holds. It will call
872 * blockPut once per reference.
873 */
874 void
875 blockDupLock(Block *b)
876 {
877 assert(b->nlock > 0);
878 b->nlock++;
879 }
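/*
 * Sketch of the nlock discipline described above (illustrative only):
 *
 *	b = cacheLocal(c, PartData, addr, OReadWrite);	// one name, nlock==1
 *	blockDupLock(b);				// second name, nlock==2
 *	...
 *	blockPut(b);					// drop one name
 *	blockPut(b);					// last name; block is released
 */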
880
881 /*
882 * we're done with the block.
883 * unlock it. can't use it after calling this.
884 */
885 void
886 blockPut(Block* b)
887 {
888 Cache *c;
889
890 if(b == nil)
891 return;
892
893 if(0)fprint(2, "%s: blockPut: %d: %d %x %d %s\n", argv0, getpid(), b->part, b->addr, c->nheap, bioStr(b->iostate));
894
895 if(b->iostate == BioDirty)
896 bwatchDependency(b);
897
898 if(--b->nlock > 0)
899 return;
900
901 /*
902 * b->nlock should probably stay at zero while
903 * the block is unlocked, but diskThread and rsleep
904 * conspire to assume that they can just qlock(&b->lk); blockPut(b),
905 * so we have to keep b->nlock set to 1 even
906 * when the block is unlocked.
907 */
908 assert(b->nlock == 0);
909 b->nlock = 1;
910 // b->pc = 0;
911
912 bwatchUnlock(b);
913 qunlock(&b->lk);
914 c = b->c;
915 qlock(&c->lk);
916
917 if(--b->ref > 0){
918 qunlock(&c->lk);
919 return;
920 }
921
922 assert(b->ref == 0);
923 switch(b->iostate){
924 default:
925 b->used = c->now++;
926 heapIns(b);
927 break;
928 case BioEmpty:
929 case BioLabel:
930 if(c->nheap == 0)
931 b->used = c->now++;
932 else
933 b->used = c->heap[0]->used;
934 heapIns(b);
935 break;
936 case BioDirty:
937 break;
938 }
939 qunlock(&c->lk);
940 }
941
942 /*
943 * set the label associated with a block.
944 */
945 Block*
946 _blockSetLabel(Block *b, Label *l)
947 {
948 int lpb;
949 Block *bb;
950 u32int a;
951 Cache *c;
952
953 c = b->c;
954
955 assert(b->part == PartData);
956 assert(b->iostate == BioLabel || b->iostate == BioClean || b->iostate == BioDirty);
957 lpb = c->size / LabelSize;
958 a = b->addr / lpb;
959 bb = cacheLocal(c, PartLabel, a, OReadWrite);
960 if(bb == nil){
961 blockPut(b);
962 return nil;
963 }
964 b->l = *l;
965 labelPack(l, bb->data, b->addr%lpb);
966 blockDirty(bb);
967 return bb;
968 }
969
970 int
971 blockSetLabel(Block *b, Label *l, int allocating)
972 {
973 Block *lb;
974
975 lb = _blockSetLabel(b, l);
976 if(lb == nil)
977 return 0;
978
979 /*
980 * If we're allocating the block, make sure the label (bl)
981 * goes to disk before the data block (b) itself. This is to help
982 * the blocks that in turn depend on b.
983 *
984 * Suppose bx depends on (must be written out after) b.
985 * Once we write b we'll think it's safe to write bx.
986 * Bx can't get at b unless it has a valid label, though.
987 *
988 * Allocation is the only case in which having a current label
989 * is vital because:
990 *
991 * - l.type is set at allocation and never changes.
992 * - l.tag is set at allocation and never changes.
993 * - l.state is not checked when we load blocks.
994 * - the archiver cares deeply about l.state being
995 * BaActive vs. BaCopied, but that's handled
996 * by direct calls to _blockSetLabel.
997 */
998
999 if(allocating)
1000 blockDependency(b, lb, -1, nil, nil);
1001 blockPut(lb);
1002 return 1;
1003 }
1004
1005 /*
1006 * Record that bb must be written out before b.
1007 * If index is given, we're about to overwrite the score/e
1008 * at that index in the block. Save the old value so we
1009 * can write a safer ``old'' version of the block if pressed.
1010 */
1011 void
1012 blockDependency(Block *b, Block *bb, int index, uchar *score, Entry *e)
1013 {
1014 BList *p;
1015
1016 if(bb->iostate == BioClean)
1017 return;
1018
1019 /*
1020 * Dependencies for blocks containing Entry structures
1021 * or scores must always be explained. The problem with
1022 * only explaining some of them is this. Suppose we have two
1023 * dependencies for the same field, the first explained
1024 * and the second not. We try to write the block when the first
1025 * dependency is not written but the second is. We will roll back
1026 * the first change even though the second trumps it.
1027 */
1028 if(index == -1 && bb->part == PartData)
1029 assert(b->l.type == BtData);
1030
1031 if(bb->iostate != BioDirty){
1032 fprint(2, "%s: %d:%x:%d iostate is %d in blockDependency\n",
1033 argv0, bb->part, bb->addr, bb->l.type, bb->iostate);
1034 abort();
1035 }
1036
1037 p = blistAlloc(bb);
1038 if(p == nil)
1039 return;
1040
1041 assert(bb->iostate == BioDirty);
1042 if(0)fprint(2, "%s: %d:%x:%d depends on %d:%x:%d\n", argv0, b->part, b->addr, b->l.type, bb->part, bb->addr, bb->l.type);
1043
1044 p->part = bb->part;
1045 p->addr = bb->addr;
1046 p->type = bb->l.type;
1047 p->vers = bb->vers;
1048 p->index = index;
1049 if(p->index >= 0){
1050 /*
1051 * This test would just be b->l.type==BtDir except
1052 * we need to exclude the super block.
1053 */
1054 if(b->l.type == BtDir && b->part == PartData)
1055 entryPack(e, p->old.entry, 0);
1056 else
1057 memmove(p->old.score, score, VtScoreSize);
1058 }
1059 p->next = b->prior;
1060 b->prior = p;
1061 }
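/*
 * Sketch of the usual calling pattern (names are illustrative, not from
 * this file): before overwriting slot i of a pointer block p with the
 * score of a freshly dirtied child bb, record the dependency along with
 * the old score so blockRollback can reconstruct the old version:
 *
 *	blockDependency(p, bb, i, oldscore, nil);
 *	memmove(p->data + i*VtScoreSize, bb->score, VtScoreSize);
 *	blockDirty(p);
 */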
1062
1063 /*
1064 * Mark an in-memory block as dirty. If there are too many
1065 * dirty blocks, start writing some out to disk.
1066 *
1067 * If there were way too many dirty blocks, we used to
1068 * try to do some flushing ourselves, but it's just too dangerous --
1069 * it implies that the callers cannot have any of our priors locked,
1070 * but this is hard to avoid in some cases.
1071 */
1072 int
1073 blockDirty(Block *b)
1074 {
1075 Cache *c;
1076
1077 c = b->c;
1078
1079 assert(b->part != PartVenti);
1080
1081 if(b->iostate == BioDirty)
1082 return 1;
1083 assert(b->iostate == BioClean || b->iostate == BioLabel);
1084
1085 qlock(&c->lk);
1086 b->iostate = BioDirty;
1087 c->ndirty++;
1088 if(c->ndirty > (c->maxdirty>>1))
1089 rwakeup(&c->flush);
1090 qunlock(&c->lk);
1091
1092 return 1;
1093 }
1094
1095 /*
1096 * We've decided to write out b. Maybe b has some pointers to blocks
1097 * that haven't yet been written to disk. If so, construct a slightly out-of-date
1098 * copy of b that is safe to write out. (diskThread will make sure the block
1099 * remains marked as dirty.)
1100 */
1101 uchar *
1102 blockRollback(Block *b, uchar *buf)
1103 {
1104 u32int addr;
1105 BList *p;
1106 Super super;
1107
1108 /* easy case */
1109 if(b->prior == nil)
1110 return b->data;
1111
1112 memmove(buf, b->data, b->c->size);
1113 for(p=b->prior; p; p=p->next){
1114 /*
1115 * we know p->index >= 0 because blockWrite has vetted this block for us.
1116 */
1117 assert(p->index >= 0);
1118 assert(b->part == PartSuper || (b->part == PartData && b->l.type != BtData));
1119 if(b->part == PartSuper){
1120 assert(p->index == 0);
1121 superUnpack(&super, buf);
1122 addr = globalToLocal(p->old.score);
1123 if(addr == NilBlock){
1124 fprint(2, "%s: rolling back super block: "
1125 "bad replacement addr %V\n",
1126 argv0, p->old.score);
1127 abort();
1128 }
1129 super.active = addr;
1130 superPack(&super, buf);
1131 continue;
1132 }
1133 if(b->l.type == BtDir)
1134 memmove(buf+p->index*VtEntrySize, p->old.entry, VtEntrySize);
1135 else
1136 memmove(buf+p->index*VtScoreSize, p->old.score, VtScoreSize);
1137 }
1138 return buf;
1139 }
1140
1141 /*
1142 * Try to write block b.
1143 * If b depends on other blocks:
1144 *
1145 * If the block has been written out, remove the dependency.
1146 * If the dependency is replaced by a more recent dependency,
1147 * throw it out.
1148 * If we know how to write out an old version of b that doesn't
1149 * depend on it, do that.
1150 *
1151 * Otherwise, bail.
1152 */
1153 int
1154 blockWrite(Block *b, int waitlock)
1155 {
1156 uchar *dmap;
1157 Cache *c;
1158 BList *p, **pp;
1159 Block *bb;
1160 int lockfail;
1161
1162 c = b->c;
1163
1164 if(b->iostate != BioDirty)
1165 return 1;
1166
1167 dmap = b->dmap;
1168 memset(dmap, 0, c->ndmap);
1169 pp = &b->prior;
1170 for(p=*pp; p; p=*pp){
1171 if(p->index >= 0){
1172 /* more recent dependency has succeeded; this one can go */
1173 if(dmap[p->index/8] & (1<<(p->index%8)))
1174 goto ignblock;
1175 }
1176
1177 lockfail = 0;
1178 bb = _cacheLocalLookup(c, p->part, p->addr, p->vers, waitlock,
1179 &lockfail);
1180 if(bb == nil){
1181 if(lockfail)
1182 return 0;
1183 /* block not in cache => was written already */
1184 dmap[p->index/8] |= 1<<(p->index%8);
1185 goto ignblock;
1186 }
1187
1188 /*
1189 * same version of block is still in cache.
1190 *
1191 * the assertion is true because the block still has version p->vers,
1192 * which means it hasn't been written out since we last saw it.
1193 */
1194 if(bb->iostate != BioDirty){
1195 fprint(2, "%s: %d:%x:%d iostate is %d in blockWrite\n",
1196 argv0, bb->part, bb->addr, bb->l.type, bb->iostate);
1197 /* probably BioWriting if it happens? */
1198 if(bb->iostate == BioClean)
1199 goto ignblock;
1200 }
1201
1202 blockPut(bb);
1203
1204 if(p->index < 0){
1205 /*
1206 * We don't know how to temporarily undo
1207 * b's dependency on bb, so just don't write b yet.
1208 */
1209 if(0) fprint(2, "%s: blockWrite skipping %d %x %d %d; need to write %d %x %d\n",
1210 argv0, b->part, b->addr, b->vers, b->l.type, p->part, p->addr, bb->vers);
1211 return 0;
1212 }
1213 /* keep walking down the list */
1214 pp = &p->next;
1215 continue;
1216
1217 ignblock:
1218 *pp = p->next;
1219 blistFree(c, p);
1220 continue;
1221 }
1222
1223 /*
1224 * DiskWrite must never be called with a double-locked block.
1225 * This call to diskWrite is okay because blockWrite is only called
1226 * from the cache flush thread, which never double-locks a block.
1227 */
1228 diskWrite(c->disk, b);
1229 return 1;
1230 }
1231
1232 /*
1233 * Change the I/O state of block b.
1234 * Just an assignment except for magic in
1235 * switch statement (read comments there).
1236 */
1237 void
1238 blockSetIOState(Block *b, int iostate)
1239 {
1240 int dowakeup;
1241 Cache *c;
1242 BList *p, *q;
1243
1244 if(0) fprint(2, "%s: iostate part=%d addr=%x %s->%s\n", argv0, b->part, b->addr, bioStr(b->iostate), bioStr(iostate));
1245
1246 c = b->c;
1247
1248 dowakeup = 0;
1249 switch(iostate){
1250 default:
1251 abort();
1252 case BioEmpty:
1253 assert(!b->uhead);
1254 break;
1255 case BioLabel:
1256 assert(!b->uhead);
1257 break;
1258 case BioClean:
1259 bwatchDependency(b);
1260 /*
1261 * If b->prior is set, it means a write just finished.
1262 * The prior list isn't needed anymore.
1263 */
1264 for(p=b->prior; p; p=q){
1265 q = p->next;
1266 blistFree(c, p);
1267 }
1268 b->prior = nil;
1269 /*
1270 * Freeing a block or just finished a write.
1271 * Move the blocks from the per-block unlink
1272 * queue to the cache unlink queue.
1273 */
1274 if(b->iostate == BioDirty || b->iostate == BioWriting){
1275 qlock(&c->lk);
1276 c->ndirty--;
1277 b->iostate = iostate; /* change here to keep in sync with ndirty */
1278 b->vers = c->vers++;
1279 if(b->uhead){
1280 /* add unlink blocks to unlink queue */
1281 if(c->uhead == nil){
1282 c->uhead = b->uhead;
1283 rwakeup(&c->unlink);
1284 }else
1285 c->utail->next = b->uhead;
1286 c->utail = b->utail;
1287 b->uhead = nil;
1288 }
1289 qunlock(&c->lk);
1290 }
1291 assert(!b->uhead);
1292 dowakeup = 1;
1293 break;
1294 case BioDirty:
1295 /*
1296 * Wrote out an old version of the block (see blockRollback).
1297 * Bump a version count, leave it dirty.
1298 */
1299 if(b->iostate == BioWriting){
1300 qlock(&c->lk);
1301 b->vers = c->vers++;
1302 qunlock(&c->lk);
1303 dowakeup = 1;
1304 }
1305 break;
1306 case BioReading:
1307 case BioWriting:
1308 /*
1309 * Adding block to disk queue. Bump reference count.
1310 * diskThread decs the count later by calling blockPut.
1311 * This is here because we need to lock c->lk to
1312 * manipulate the ref count.
1313 */
1314 qlock(&c->lk);
1315 b->ref++;
1316 qunlock(&c->lk);
1317 break;
1318 case BioReadError:
1319 case BioVentiError:
1320 /*
1321 * Oops.
1322 */
1323 dowakeup = 1;
1324 break;
1325 }
1326 b->iostate = iostate;
1327 /*
1328 * Now that the state has changed, we can wake the waiters.
1329 */
1330 if(dowakeup)
1331 rwakeupall(&b->ioready);
1332 }
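/*
 * Summarizing the switch above: only the transition into BioClean from
 * BioDirty or BioWriting decrements ndirty and hands the per-block
 * unlink queue to the cache; BioWriting -> BioDirty means a rolled-back
 * old version was written and the block stays dirty (only the version
 * number is bumped); entering BioReading or BioWriting takes an extra
 * reference, which diskThread later drops with blockPut.
 */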
1333
1334 /*
1335 * The active file system is a tree of blocks.
1336 * When we add snapshots to the mix, the entire file system
1337 * becomes a dag and thus requires a bit more care.
1338 *
1339 * The life of the file system is divided into epochs. A snapshot
1340 * ends one epoch and begins the next. Each file system block
1341 * is marked with the epoch in which it was created (b.epoch).
1342 * When the block is unlinked from the file system (closed), it is marked
1343 * with the epoch in which it was removed (b.epochClose).
1344 * Once we have discarded or archived all snapshots up to
1345 * b.epochClose, we can reclaim the block.
1346 *
1347 * If a block was created in a past epoch but is not yet closed,
1348 * it is treated as copy-on-write. Of course, in order to insert the
1349 * new pointer into the tree, the parent must be made writable,
1350 * and so on up the tree. The recursion stops because the root
1351 * block is always writable.
1352 *
1353 * If blocks are never closed, they will never be reused, and
1354 * we will run out of disk space. But marking a block as closed
1355 * requires some care about dependencies and write orderings.
1356 *
1357 * (1) If a block p points at a copy-on-write block b and we
1358 * copy b to create bb, then p must be written out after bb and
1359 * lbb (bb's label block).
1360 *
1361 * (2) We have to mark b as closed, but only after we switch
1362 * the pointer, so lb must be written out after p. In fact, we
1363 * can't even update the in-memory copy, or the cache might
1364 * mistakenly give out b for reuse before p gets written.
1365 *
1366 * CacheAllocBlock's call to blockSetLabel records a "bb after lbb" dependency.
1367 * The caller is expected to record a "p after bb" dependency
1368 * to finish (1), and also expected to call blockRemoveLink
1369 * to arrange for (2) to happen once p is written.
1370 *
1371 * Until (2) happens, some pieces of the code (e.g., the archiver)
1372 * still need to know whether a block has been copied, so we
1373 * set the BsCopied bit in the label and force that to disk *before*
1374 * the copy gets written out.
1375 */
1376 Block*
1377 blockCopy(Block *b, u32int tag, u32int ehi, u32int elo)
1378 {
1379 Block *bb, *lb;
1380 Label l;
1381
1382 if((b->l.state&BsClosed) || b->l.epoch >= ehi)
1383 fprint(2, "%s: blockCopy %#ux %L but fs is [%ud,%ud]\n",
1384 argv0, b->addr, &b->l, elo, ehi);
1385
1386 bb = cacheAllocBlock(b->c, b->l.type, tag, ehi, elo);
1387 if(bb == nil){
1388 blockPut(b);
1389 return nil;
1390 }
1391
1392 /*
1393 * Update label so we know the block has been copied.
1394 * (It will be marked closed once it has been unlinked from
1395 * the tree.) This must follow cacheAllocBlock since we
1396 * can't be holding onto lb when we call cacheAllocBlock.
1397 */
1398 if((b->l.state&BsCopied)==0)
1399 if(b->part == PartData){ /* not the superblock */
1400 l = b->l;
1401 l.state |= BsCopied;
1402 lb = _blockSetLabel(b, &l);
1403 if(lb == nil){
1404 /* can't set label => can't copy block */
1405 blockPut(b);
1406 l.type = BtMax;
1407 l.state = BsFree;
1408 l.epoch = 0;
1409 l.epochClose = 0;
1410 l.tag = 0;
1411 blockSetLabel(bb, &l, 0);
1412 blockPut(bb);
1413 return nil;
1414 }
1415 blockDependency(bb, lb, -1, nil, nil);
1416 blockPut(lb);
1417 }
1418
1419 memmove(bb->data, b->data, b->c->size);
1420 blockDirty(bb);
1421 blockPut(b);
1422 return bb;
1423 }
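/*
 * Sketch of what the comment before blockCopy expects from the caller
 * (p, i, oldscore, oldaddr and friends are placeholders, not names from
 * this file): after copying b to bb, the parent p must be written after
 * bb, and the old block is closed only once p reaches disk:
 *
 *	bb = blockCopy(b, tag, ehi, elo);
 *	if(bb == nil)
 *		return nil;
 *	blockDependency(p, bb, i, oldscore, nil);	// (1) p after bb
 *	memmove(p->data + i*VtScoreSize, bb->score, VtScoreSize);
 *	blockDirty(p);
 *	blockRemoveLink(p, oldaddr, oldtype, oldtag, 0);	// (2) close old block
 */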
1424
1425 /*
1426 * Block b once pointed at the block bb at addr/type/tag, but no longer does.
1427 * If recurse is set, we are unlinking all of bb's children as well.
1428 *
1429 * We can't reclaim bb (or its kids) until the block b gets written to disk. We add
1430 * the relevant information to b's list of unlinked blocks. Once b is written,
1431 * the list will be queued for processing.
1432 *
1433 * If b depends on bb, it doesn't anymore, so we remove bb from the prior list.
1434 */
1435 void
1436 blockRemoveLink(Block *b, u32int addr, int type, u32int tag, int recurse)
1437 {
1438 BList *p, **pp, bl;
1439
1440 /* remove bb from prior list */
1441 for(pp=&b->prior; (p=*pp)!=nil; ){
1442 if(p->part == PartData && p->addr == addr){
1443 *pp = p->next;
1444 blistFree(b->c, p);
1445 }else
1446 pp = &p->next;
1447 }
1448
1449 bl.part = PartData;
1450 bl.addr = addr;
1451 bl.type = type;
1452 bl.tag = tag;
1453 if(b->l.epoch == 0)
1454 assert(b->part == PartSuper);
1455 bl.epoch = b->l.epoch;
1456 bl.next = nil;
1457 bl.recurse = recurse;
1458
1459 if(b->part == PartSuper && b->iostate == BioClean)
1460 p = nil;
1461 else
1462 p = blistAlloc(b);
1463 if(p == nil){
1464 /*
1465 * b has already been written to disk.
1466 */
1467 doRemoveLink(b->c, &bl);
1468 return;
1469 }
1470
1471 /* Uhead is only processed when the block goes from Dirty -> Clean */
1472 assert(b->iostate == BioDirty);
1473
1474 *p = bl;
1475 if(b->uhead == nil)
1476 b->uhead = p;
1477 else
1478 b->utail->next = p;
1479 b->utail = p;
1480 }
1481
1482 /*
1483 * Process removal of a single block and perhaps its children.
1484 */
1485 static void
1486 doRemoveLink(Cache *c, BList *p)
1487 {
1488 int i, n, recurse;
1489 u32int a;
1490 Block *b;
1491 Label l;
1492 BList bl;
1493
1494 recurse = (p->recurse && p->type != BtData && p->type != BtDir);
1495
1496 /*
1497 * We're not really going to overwrite b, but if we're not
1498 * going to look at its contents, there is no point in reading
1499 * them from the disk.
1500 */
1501 b = cacheLocalData(c, p->addr, p->type, p->tag, recurse ? OReadOnly : OOverWrite, 0);
1502 if(b == nil)
1503 return;
1504
1505 /*
1506 * When we're unlinking from the superblock, close with the next epoch.
1507 */
1508 if(p->epoch == 0)
1509 p->epoch = b->l.epoch+1;
1510
1511 /* sanity check */
1512 if(b->l.epoch > p->epoch){
1513 fprint(2, "%s: doRemoveLink: strange epoch %ud > %ud\n",
1514 argv0, b->l.epoch, p->epoch);
1515 blockPut(b);
1516 return;
1517 }
1518
1519 if(recurse){
1520 n = c->size / VtScoreSize;
1521 for(i=0; i<n; i++){
1522 a = globalToLocal(b->data + i*VtScoreSize);
1523 if(a == NilBlock || !readLabel(c, &l, a))
1524 continue;
1525 if(l.state&BsClosed)
1526 continue;
1527 /*
1528 * If stack space becomes an issue...
1529 p->addr = a;
1530 p->type = l.type;
1531 p->tag = l.tag;
1532 doRemoveLink(c, p);
1533 */
1534
1535 bl.part = PartData;
1536 bl.addr = a;
1537 bl.type = l.type;
1538 bl.tag = l.tag;
1539 bl.epoch = p->epoch;
1540 bl.next = nil;
1541 bl.recurse = 1;
1542 /* give up the block lock - share with others */
1543 blockPut(b);
1544 doRemoveLink(c, &bl);
1545 b = cacheLocalData(c, p->addr, p->type, p->tag, OReadOnly, 0);
1546 if(b == nil){
1547 fprint(2, "%s: warning: lost block in doRemoveLink\n",
1548 argv0);
1549 return;
1550 }
1551 }
1552 }
1553
1554 l = b->l;
1555 l.state |= BsClosed;
1556 l.epochClose = p->epoch;
1557 if(l.epochClose == l.epoch){
1558 qlock(&c->fl->lk);
1559 if(l.epoch == c->fl->epochLow)
1560 c->fl->nused--;
1561 blockSetLabel(b, &l, 0);
1562 qunlock(&c->fl->lk);
1563 }else
1564 blockSetLabel(b, &l, 0);
1565 blockPut(b);
1566 }
1567
1568 /*
1569 * Allocate a BList so that we can record a dependency
1570 * or queue a removal related to block b.
1571 * If we can't find a BList, we write out b and return nil.
1572 */
1573 static BList *
1574 blistAlloc(Block *b)
1575 {
1576 Cache *c;
1577 BList *p;
1578
1579 if(b->iostate != BioDirty){
1580 /*
1581 * should not happen anymore -
1582 * blockDirty used to flush but no longer does.
1583 */
1584 assert(b->iostate == BioClean);
1585 fprint(2, "%s: blistAlloc: called on clean block\n", argv0);
1586 return nil;
1587 }
1588
1589 c = b->c;
1590 qlock(&c->lk);
1591 if(c->blfree == nil){
1592 /*
1593 * No free BLists. What are our options?
1594 */
1595
1596 /* Block has no priors? Just write it. */
1597 if(b->prior == nil){
1598 qunlock(&c->lk);
1599 diskWriteAndWait(c->disk, b);
1600 return nil;
1601 }
1602
1603 /*
1604 * Wake the flush thread, which will hopefully free up
1605 * some BLists for us. We used to flush a block from
1606 * our own prior list and reclaim that BList, but this is
1607 * a no-no: some of the blocks on our prior list may
1608 * be locked by our caller. Or maybe their label blocks
1609 * are locked by our caller. In any event, it's too hard
1610 * to make sure we can do I/O for ourselves. Instead,
1611 * we assume the flush thread will find something.
1612 * (The flush thread never blocks waiting for a block,
1613 * so it can't deadlock like we can.)
1614 */
1615 while(c->blfree == nil){
1616 rwakeup(&c->flush);
1617 rsleep(&c->blrend);
1618 if(c->blfree == nil)
1619 fprint(2, "%s: flushing for blists\n", argv0);
1620 }
1621 }
1622
1623 p = c->blfree;
1624 c->blfree = p->next;
1625 qunlock(&c->lk);
1626 return p;
1627 }
1628
1629 static void
1630 blistFree(Cache *c, BList *bl)
1631 {
1632 qlock(&c->lk);
1633 bl->next = c->blfree;
1634 c->blfree = bl;
1635 rwakeup(&c->blrend);
1636 qunlock(&c->lk);
1637 }
1638
1639 char*
1640 bsStr(int state)
1641 {
1642 static char s[100];
1643
1644 if(state == BsFree)
1645 return "Free";
1646 if(state == BsBad)
1647 return "Bad";
1648
1649 sprint(s, "%x", state);
1650 if(!(state&BsAlloc))
1651 strcat(s, ",Free"); /* should not happen */
1652 if(state&BsCopied)
1653 strcat(s, ",Copied");
1654 if(state&BsVenti)
1655 strcat(s, ",Venti");
1656 if(state&BsClosed)
1657 strcat(s, ",Closed");
1658 return s;
1659 }
1660
1661 char *
1662 bioStr(int iostate)
1663 {
1664 switch(iostate){
1665 default:
1666 return "Unknown!!";
1667 case BioEmpty:
1668 return "Empty";
1669 case BioLabel:
1670 return "Label";
1671 case BioClean:
1672 return "Clean";
1673 case BioDirty:
1674 return "Dirty";
1675 case BioReading:
1676 return "Reading";
1677 case BioWriting:
1678 return "Writing";
1679 case BioReadError:
1680 return "ReadError";
1681 case BioVentiError:
1682 return "VentiError";
1683 case BioMax:
1684 return "Max";
1685 }
1686 }
1687
1688 static char *bttab[] = {
1689 "BtData",
1690 "BtData+1",
1691 "BtData+2",
1692 "BtData+3",
1693 "BtData+4",
1694 "BtData+5",
1695 "BtData+6",
1696 "BtData+7",
1697 "BtDir",
1698 "BtDir+1",
1699 "BtDir+2",
1700 "BtDir+3",
1701 "BtDir+4",
1702 "BtDir+5",
1703 "BtDir+6",
1704 "BtDir+7",
1705 };
1706
1707 char*
1708 btStr(int type)
1709 {
1710 if(type < nelem(bttab))
1711 return bttab[type];
1712 return "unknown";
1713 }
1714
1715 int
1716 labelFmt(Fmt *f)
1717 {
1718 Label *l;
1719
1720 l = va_arg(f->args, Label*);
1721 return fmtprint(f, "%s,%s,e=%ud,%d,tag=%#ux",
1722 btStr(l->type), bsStr(l->state), l->epoch, (int)l->epochClose, l->tag);
1723 }
1724
1725 int
1726 scoreFmt(Fmt *f)
1727 {
1728 uchar *v;
1729 int i;
1730 u32int addr;
1731
1732 v = va_arg(f->args, uchar*);
1733 if(v == nil){
1734 fmtprint(f, "*");
1735 }else if((addr = globalToLocal(v)) != NilBlock)
1736 fmtprint(f, "0x%.8ux", addr);
1737 else{
1738 for(i = 0; i < VtScoreSize; i++)
1739 fmtprint(f, "%2.2ux", v[i]);
1740 }
1741
1742 return 0;
1743 }
1744
1745 static int
1746 upHeap(int i, Block *b)
1747 {
1748 Block *bb;
1749 u32int now;
1750 int p;
1751 Cache *c;
1752
1753 c = b->c;
1754 now = c->now;
1755 for(; i != 0; i = p){
1756 p = (i - 1) >> 1;
1757 bb = c->heap[p];
1758 if(b->used - now >= bb->used - now)
1759 break;
1760 c->heap[i] = bb;
1761 bb->heap = i;
1762 }
1763 c->heap[i] = b;
1764 b->heap = i;
1765
1766 return i;
1767 }
1768
1769 static int
1770 downHeap(int i, Block *b)
1771 {
1772 Block *bb;
1773 u32int now;
1774 int k;
1775 Cache *c;
1776
1777 c = b->c;
1778 now = c->now;
1779 for(; ; i = k){
1780 k = (i << 1) + 1;
1781 if(k >= c->nheap)
1782 break;
1783 if(k + 1 < c->nheap && c->heap[k]->used - now > c->heap[k + 1]->used - now)
1784 k++;
1785 bb = c->heap[k];
1786 if(b->used - now <= bb->used - now)
1787 break;
1788 c->heap[i] = bb;
1789 bb->heap = i;
1790 }
1791 c->heap[i] = b;
1792 b->heap = i;
1793 return i;
1794 }
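/*
 * The heap is ordered by b->used - now in unsigned arithmetic, which
 * appears intended to keep the comparison valid across wraparound of
 * the tick counter: only the distance from the current tick matters.
 * For example, with 32-bit ticks, a block stamped 7 ticks ago has key
 * 2^32-7 and one stamped 2 ticks ago has key 2^32-2; the older block
 * has the smaller key, so it sits nearer the root and is evicted first
 * by cacheBumpBlock.
 */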
1795
1796 /*
1797 * Delete a block from the heap.
1798 * Called with c->lk held.
1799 */
1800 static void
1801 heapDel(Block *b)
1802 {
1803 int i, si;
1804 Cache *c;
1805
1806 c = b->c;
1807
1808 si = b->heap;
1809 if(si == BadHeap)
1810 return;
1811 b->heap = BadHeap;
1812 c->nheap--;
1813 if(si == c->nheap)
1814 return;
1815 b = c->heap[c->nheap];
1816 i = upHeap(si, b);
1817 if(i == si)
1818 downHeap(i, b);
1819 }
1820
1821 /*
1822 * Insert a block into the heap.
1823 * Called with c->lk held.
1824 */
1825 static void
1826 heapIns(Block *b)
1827 {
1828 assert(b->heap == BadHeap);
1829 upHeap(b->c->nheap++, b);
1830 rwakeup(&b->c->heapwait);
1831 }
1832
1833 /*
1834 * Get just the label for a block.
1835 */
1836 int
1837 readLabel(Cache *c, Label *l, u32int addr)
1838 {
1839 int lpb;
1840 Block *b;
1841 u32int a;
1842
1843 lpb = c->size / LabelSize;
1844 a = addr / lpb;
1845 b = cacheLocal(c, PartLabel, a, OReadOnly);
1846 if(b == nil){
1847 blockPut(b);
1848 return 0;
1849 }
1850
1851 if(!labelUnpack(l, b->data, addr%lpb)){
1852 blockPut(b);
1853 return 0;
1854 }
1855 blockPut(b);
1856 return 1;
1857 }
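/*
 * A minimal sketch of a caller (hypothetical helper, not part of this
 * file): decide whether a block could be reused for the given low
 * epoch, applying the same test used by cacheAllocBlock and
 * cacheCountUsed.
 */
static int
labelReclaimable(Cache *c, u32int addr, u32int epochLow)
{
	Label l;

	if(!readLabel(c, &l, addr))
		return 0;
	if(l.state == BsFree)
		return 1;
	if((l.state&BsClosed) && (l.epochClose <= epochLow || l.epoch == l.epochClose))
		return 1;
	return 0;
}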
1858
1859 /*
1860 * Process unlink queue.
1861 * Called with c->lk held.
1862 */
1863 static void
1864 unlinkBody(Cache *c)
1865 {
1866 BList *p;
1867
1868 while(c->uhead != nil){
1869 p = c->uhead;
1870 c->uhead = p->next;
1871 qunlock(&c->lk);
1872 doRemoveLink(c, p);
1873 qlock(&c->lk);
1874 p->next = c->blfree;
1875 c->blfree = p;
1876 }
1877 }
1878
1879 /*
1880 * Occasionally unlink the blocks on the cache unlink queue.
1881 */
1882 static void
1883 unlinkThread(void *a)
1884 {
1885 Cache *c = a;
1886
1887 threadsetname("unlink");
1888
1889 qlock(&c->lk);
1890 for(;;){
1891 while(c->uhead == nil && c->die.l == nil)
1892 rsleep(&c->unlink);
1893 if(c->die.l != nil)
1894 break;
1895 unlinkBody(c);
1896 }
1897 c->ref--;
1898 rwakeup(&c->die);
1899 qunlock(&c->lk);
1900 }
1901
1902 static int
1903 baddrCmp(const void *a0, const void *a1)
1904 {
1905 BAddr *b0, *b1;
1906 b0 = (BAddr*)a0;
1907 b1 = (BAddr*)a1;
1908
1909 if(b0->part < b1->part)
1910 return -1;
1911 if(b0->part > b1->part)
1912 return 1;
1913 if(b0->addr < b1->addr)
1914 return -1;
1915 if(b0->addr > b1->addr)
1916 return 1;
1917 return 0;
1918 }
1919
1920 /*
1921 * Scan the block list for dirty blocks; add them to the list c->baddr.
1922 */
1923 static void
1924 flushFill(Cache *c)
1925 {
1926 int i, ndirty;
1927 BAddr *p;
1928 Block *b;
1929
1930 qlock(&c->lk);
1931 if(c->ndirty == 0){
1932 qunlock(&c->lk);
1933 return;
1934 }
1935
1936 p = c->baddr;
1937 ndirty = 0;
1938 for(i=0; i<c->nblocks; i++){
1939 b = c->blocks + i;
1940 if(b->part == PartError)
1941 continue;
1942 if(b->iostate == BioDirty || b->iostate == BioWriting)
1943 ndirty++;
1944 if(b->iostate != BioDirty)
1945 continue;
1946 p->part = b->part;
1947 p->addr = b->addr;
1948 p->vers = b->vers;
1949 p++;
1950 }
1951 if(ndirty != c->ndirty){
1952 fprint(2, "%s: ndirty mismatch expected %d found %d\n",
1953 argv0, c->ndirty, ndirty);
1954 c->ndirty = ndirty;
1955 }
1956 qunlock(&c->lk);
1957
1958 c->bw = p - c->baddr;
1959 qsort(c->baddr, c->bw, sizeof(BAddr), baddrCmp);
1960 }
1961
1962 /*
1963 * This is not thread safe, i.e. it can't be called from multiple threads.
1964 *
1965 * It's okay how we use it, because it only gets called in
1966 * the flushThread. And cacheFree, but only after
1967 * cacheFree has killed off the flushThread.
1968 */
1969 static int
1970 cacheFlushBlock(Cache *c)
1971 {
1972 Block *b;
1973 BAddr *p;
1974 int lockfail, nfail;
1975
1976 nfail = 0;
1977 for(;;){
1978 if(c->br == c->be){
1979 if(c->bw == 0 || c->bw == c->be)
1980 flushFill(c);
1981 c->br = 0;
1982 c->be = c->bw;
1983 c->bw = 0;
1984 c->nflush = 0;
1985 }
1986
1987 if(c->br == c->be)
1988 return 0;
1989 p = c->baddr + c->br;
1990 c->br++;
1991 b = _cacheLocalLookup(c, p->part, p->addr, p->vers, Nowaitlock,
1992 &lockfail);
1993
1994 if(b && blockWrite(b, Nowaitlock)){
1995 c->nflush++;
1996 blockPut(b);
1997 return 1;
1998 }
1999 if(b)
2000 blockPut(b);
2001
2002 /*
2003 * Why didn't we write the block?
2004 */
2005
2006 /* Block already written out */
2007 if(b == nil && !lockfail)
2008 continue;
2009
2010 /* Failed to acquire lock; sleep if happens a lot. */
2011 if(lockfail && ++nfail > 100){
2012 sleep(500);
2013 nfail = 0;
2014 }
2015 /* Requeue block. */
2016 if(c->bw < c->be)
2017 c->baddr[c->bw++] = *p;
2018 }
2019 }
2020
2021 /*
2022 * Occasionally flush dirty blocks from memory to the disk.
2023 */
2024 static void
2025 flushThread(void *a)
2026 {
2027 Cache *c = a;
2028 int i;
2029
2030 threadsetname("flush");
2031 qlock(&c->lk);
2032 while(c->die.l == nil){
2033 rsleep(&c->flush);
2034 qunlock(&c->lk);
2035 for(i=0; i<FlushSize; i++)
2036 if(!cacheFlushBlock(c)){
2037 /*
2038 * If i==0, could be someone is waking us repeatedly
2039 * to flush the cache but there's no work to do.
2040 * Pause a little.
2041 */
2042 if(i==0){
2043 // fprint(2, "%s: flushthread found "
2044 // "nothing to flush - %d dirty\n",
2045 // argv0, c->ndirty);
2046 sleep(250);
2047 }
2048 break;
2049 }
2050 if(i==0 && c->ndirty){
2051 /*
2052 * All the blocks are being written right now -- there's nothing to do.
2053 * We might be spinning with cacheFlush though -- he'll just keep
2054 * kicking us until c->ndirty goes down. Probably we should sleep
2055 * on something that the diskThread can kick, but for now we'll
2056 * just pause for a little while waiting for disks to finish.
2057 */
2058 sleep(100);
2059 }
2060 qlock(&c->lk);
2061 rwakeupall(&c->flushwait);
2062 }
2063 c->ref--;
2064 rwakeup(&c->die);
2065 qunlock(&c->lk);
2066 }
2067
2068 /*
2069 * Flush the cache.
2070 */
2071 void
2072 cacheFlush(Cache *c, int wait)
2073 {
2074 qlock(&c->lk);
2075 if(wait){
2076 while(c->ndirty){
2077 // consPrint("cacheFlush: %d dirty blocks, uhead %p\n",
2078 // c->ndirty, c->uhead);
2079 rwakeup(&c->flush);
2080 rsleep(&c->flushwait);
2081 }
2082 // consPrint("cacheFlush: done (uhead %p)\n", c->ndirty, c->uhead);
2083 }else if(c->ndirty)
2084 rwakeup(&c->flush);
2085 qunlock(&c->lk);
2086 }
2087
2088 /*
2089 * Kick the flushThread every 30 seconds.
2090 */
2091 static void
2092 cacheSync(void *v)
2093 {
2094 Cache *c;
2095
2096 c = v;
2097 cacheFlush(c, 0);
2098 }