URI:
       tutf.c - plan9port - [fork] Plan 9 from user space
  HTML git clone git://src.adamsgaard.dk/plan9port
   DIR Log
   DIR Files
   DIR Refs
   DIR README
   DIR LICENSE
       ---
       tutf.c (5985B)
       ---
            1 #ifdef PLAN9
            2 #include        <u.h>
            3 #include        <libc.h>
            4 #include        <bio.h>
            5 #ifdef PLAN9PORT
            6 #include        <errno.h>
            7 #else
            8 extern int errno;
            9 #endif
           10 #else
           11 #include        <sys/types.h>
           12 #include        <stdio.h>
           13 #include        <stdlib.h>
           14 #include        <string.h>
           15 #include        <unistd.h>
           16 #include        <errno.h>
           17 #include        "plan9.h"
           18 #endif
           19 #include        "hdr.h"
           20 #ifndef EILSEQ
           21 #define EILSEQ 9998
           22 #endif
           23 
           24 /*
           25         the our_* routines are implementations for the corresponding library
           26         routines. for a while, i tried to actually name them wctomb etc
           27         but stopped that after i found a system which made wchar_t an
           28         unsigned char.
           29 */
           30 
           31 int our_wctomb(char *s, unsigned long wc);
           32 int our_mbtowc(unsigned long *p, char *s, unsigned n);
           33 int runetoisoutf(char *str, Rune *rune);
           34 int fullisorune(char *str, int n);
           35 int isochartorune(Rune *rune, char *str);
           36 
           37 void
           38 utf_in(int fd, long *notused, struct convert *out)
           39 {
           40         char buf[N];
           41         int i, j, c, n, tot;
           42         ulong l;
           43 
           44         USED(notused);
           45         tot = 0;
           46         while((n = read(fd, buf+tot, N-tot)) >= 0){
           47                 tot += n;
           48                 for(i=j=0; i<=tot-UTFmax || (i<tot && (n==0 || fullrune(buf+i, tot-i))); ){
           49                         c = our_mbtowc(&l, buf+i, tot-i);
           50                         if(c == -1){
           51                                 if(squawk)
           52                                         EPR "%s: bad UTF sequence near byte %ld in input\n", argv0, ninput+i);
           53                                 if(clean){
           54                                         i++;
           55                                         continue;
           56                                 }
           57                                 nerrors++;
           58                                 l = Runeerror;
           59                                 c = 1;
           60                         }
           61                         runes[j++] = l;
           62                         i += c;
           63                 }
           64                 OUT(out, runes, j);
           65                 tot -= i;
           66                 ninput += i;
           67                 if(tot)
           68                         memmove(buf, buf+i, tot);
           69                 if(n == 0)
           70                         break;
           71         }
           72         OUT(out, runes, 0);
           73 }
           74 
           75 void
           76 utf_out(Rune *base, int n, long *notused)
           77 {
           78         char *p;
           79         Rune *r;
           80 
           81         USED(notused);
           82         nrunes += n;
           83         for(r = base, p = obuf; n-- > 0; r++){
           84                 p += our_wctomb(p, *r);
           85         }
           86         noutput += p-obuf;
           87         write(1, obuf, p-obuf);
           88 }
           89 
           90 void
           91 isoutf_in(int fd, long *notused, struct convert *out)
           92 {
           93         char buf[N];
           94         int i, j, c, n, tot;
           95 
           96         USED(notused);
           97         tot = 0;
           98         while((n = read(fd, buf+tot, N-tot)) >= 0){
           99                 tot += n;
          100                 for(i=j=0; i<tot; ){
          101                         if(!fullisorune(buf+i, tot-i))
          102                                 break;
          103                         c = isochartorune(&runes[j], buf+i);
          104                         if(runes[j] == Runeerror && c == 1){
          105                                 if(squawk)
          106                                         EPR "%s: bad UTF sequence near byte %ld in input\n", argv0, ninput+i);
          107                                 if(clean){
          108                                         i++;
          109                                         continue;
          110                                 }
          111                                 nerrors++;
          112                         }
          113                         j++;
          114                         i += c;
          115                 }
          116                 OUT(out, runes, j);
          117                 tot -= i;
          118                 ninput += i;
          119                 if(tot)
          120                         memmove(buf, buf+i, tot);
          121                 if(n == 0)
          122                         break;
          123         }
          124         OUT(out, runes, 0);
          125 }
          126 
          127 void
          128 isoutf_out(Rune *base, int n, long *notused)
          129 {
          130         char *p;
          131         Rune *r;
          132 
          133         USED(notused);
          134         nrunes += n;
          135         for(r = base, p = obuf; n-- > 0; r++)
          136                 p += runetoisoutf(p, r);
          137         noutput += p-obuf;
          138         write(1, obuf, p-obuf);
          139 }
          140 
          141 
          142 int
          143 isochartorune(Rune *rune, char *str)
          144 {
          145         return chartorune(rune, str);
          146 }
          147 
          148 int
          149 runetoisoutf(char *str, Rune *rune)
          150 {
          151         return runetochar(str, rune);
          152 }
          153 
          154 int
          155 fullisorune(char *str, int n)
          156 {
          157         return fullrune(str, n);
          158 }
          159 
          160 enum
          161 {
          162         T1        = 0x00,
          163         Tx        = 0x80,
          164         T2        = 0xC0,
          165         T3        = 0xE0,
          166         T4        = 0xF0,
          167         T5        = 0xF8,
          168         T6        = 0xFC,
          169 
          170         Bit1        = 7,
          171         Bitx        = 6,
          172         Bit2        = 5,
          173         Bit3        = 4,
          174         Bit4        = 3,
          175         Bit5        = 2,
          176         Bit6        = 2,
          177 
          178         Mask1        = (1<<Bit1)-1,
          179         Maskx        = (1<<Bitx)-1,
          180         Mask2        = (1<<Bit2)-1,
          181         Mask3        = (1<<Bit3)-1,
          182         Mask4        = (1<<Bit4)-1,
          183         Mask5        = (1<<Bit5)-1,
          184         Mask6        = (1<<Bit6)-1,
          185 
          186         Wchar1        = (1UL<<Bit1)-1,
          187         Wchar2        = (1UL<<(Bit2+Bitx))-1,
          188         Wchar3        = (1UL<<(Bit3+2*Bitx))-1,
          189         Wchar4        = (1UL<<(Bit4+3*Bitx))-1,
          190         Wchar5        = (1UL<<(Bit5+4*Bitx))-1
          191 };
          192 
          193 int
          194 our_wctomb(char *s, unsigned long wc)
          195 {
          196         if(s == 0)
          197                 return 0;                /* no shift states */
          198         if(wc & ~Wchar2) {
          199                 if(wc & ~Wchar4) {
          200                         if(wc & ~Wchar5) {
          201                                 /* 6 bytes */
          202                                 s[0] = T6 | ((wc >> 5*Bitx) & Mask6);
          203                                 s[1] = Tx | ((wc >> 4*Bitx) & Maskx);
          204                                 s[2] = Tx | ((wc >> 3*Bitx) & Maskx);
          205                                 s[3] = Tx | ((wc >> 2*Bitx) & Maskx);
          206                                 s[4] = Tx | ((wc >> 1*Bitx) & Maskx);
          207                                 s[5] = Tx |  (wc & Maskx);
          208                                 return 6;
          209                         }
          210                         /* 5 bytes */
          211                         s[0] = T5 |  (wc >> 4*Bitx);
          212                         s[1] = Tx | ((wc >> 3*Bitx) & Maskx);
          213                         s[2] = Tx | ((wc >> 2*Bitx) & Maskx);
          214                         s[3] = Tx | ((wc >> 1*Bitx) & Maskx);
          215                         s[4] = Tx |  (wc & Maskx);
          216                         return 5;
          217                 }
          218                 if(wc & ~Wchar3) {
          219                         /* 4 bytes */
          220                         s[0] = T4 |  (wc >> 3*Bitx);
          221                         s[1] = Tx | ((wc >> 2*Bitx) & Maskx);
          222                         s[2] = Tx | ((wc >> 1*Bitx) & Maskx);
          223                         s[3] = Tx |  (wc & Maskx);
          224                         return 4;
          225                 }
          226                 /* 3 bytes */
          227                 s[0] = T3 |  (wc >> 2*Bitx);
          228                 s[1] = Tx | ((wc >> 1*Bitx) & Maskx);
          229                 s[2] = Tx |  (wc & Maskx);
          230                 return 3;
          231         }
          232         if(wc & ~Wchar1) {
          233                 /* 2 bytes */
          234                 s[0] = T2 | (wc >> 1*Bitx);
          235                 s[1] = Tx | (wc & Maskx);
          236                 return 2;
          237         }
          238         /* 1 byte */
          239         s[0] = T1 | wc;
          240         return 1;
          241 }
          242 
          243 int
          244 our_mbtowc(unsigned long *p, char *s, unsigned n)
          245 {
          246         uchar *us;
          247         int c0, c1, c2, c3, c4, c5;
          248         unsigned long wc;
          249 
          250         if(s == 0)
          251                 return 0;                /* no shift states */
          252 
          253         if(n < 1)
          254                 goto bad;
          255         us = (uchar*)s;
          256         c0 = us[0];
          257         if(c0 >= T3) {
          258                 if(n < 3)
          259                         goto bad;
          260                 c1 = us[1] ^ Tx;
          261                 c2 = us[2] ^ Tx;
          262                 if((c1|c2) & T2)
          263                         goto bad;
          264                 if(c0 >= T5) {
          265                         if(n < 5)
          266                                 goto bad;
          267                         c3 = us[3] ^ Tx;
          268                         c4 = us[4] ^ Tx;
          269                         if((c3|c4) & T2)
          270                                 goto bad;
          271                         if(c0 >= T6) {
          272                                 /* 6 bytes */
          273                                 if(n < 6)
          274                                         goto bad;
          275                                 c5 = us[5] ^ Tx;
          276                                 if(c5 & T2)
          277                                         goto bad;
          278                                 wc = ((((((((((c0 & Mask6) << Bitx) |
          279                                         c1) << Bitx) | c2) << Bitx) |
          280                                         c3) << Bitx) | c4) << Bitx) | c5;
          281                                 if(wc <= Wchar5)
          282                                         goto bad;
          283                                 *p = wc;
          284                                 return 6;
          285                         }
          286                         /* 5 bytes */
          287                         wc = ((((((((c0 & Mask5) << Bitx) |
          288                                 c1) << Bitx) | c2) << Bitx) |
          289                                 c3) << Bitx) | c4;
          290                         if(wc <= Wchar4)
          291                                 goto bad;
          292                         *p = wc;
          293                         return 5;
          294                 }
          295                 if(c0 >= T4) {
          296                         /* 4 bytes */
          297                         if(n < 4)
          298                                 goto bad;
          299                         c3 = us[3] ^ Tx;
          300                         if(c3 & T2)
          301                                 goto bad;
          302                         wc = ((((((c0 & Mask4) << Bitx) |
          303                                 c1) << Bitx) | c2) << Bitx) |
          304                                 c3;
          305                         if(wc <= Wchar3)
          306                                 goto bad;
          307                         *p = wc;
          308                         return 4;
          309                 }
          310                 /* 3 bytes */
          311                 wc = ((((c0 & Mask3) << Bitx) |
          312                         c1) << Bitx) | c2;
          313                 if(wc <= Wchar2)
          314                         goto bad;
          315                 *p = wc;
          316                 return 3;
          317         }
          318         if(c0 >= T2) {
          319                 /* 2 bytes */
          320                 if(n < 2)
          321                         goto bad;
          322                 c1 = us[1] ^ Tx;
          323                 if(c1 & T2)
          324                         goto bad;
          325                 wc = ((c0 & Mask2) << Bitx) |
          326                         c1;
          327                 if(wc <= Wchar1)
          328                         goto bad;
          329                 *p = wc;
          330                 return 2;
          331         }
          332         /* 1 byte */
          333         if(c0 >= Tx)
          334                 goto bad;
          335         *p = c0;
          336         return 1;
          337 
          338 bad:
          339         errno = EILSEQ;
          340         return -1;
          341 }