URI:
       tconv_jis.c - plan9port - [fork] Plan 9 from user space
  HTML git clone git://src.adamsgaard.dk/plan9port
   DIR Log
   DIR Files
   DIR Refs
   DIR README
   DIR LICENSE
       ---
       tconv_jis.c (11016B)
       ---
            1 #ifdef        PLAN9
            2 #include        <u.h>
            3 #include        <libc.h>
            4 #include        <bio.h>
            5 #else
            6 #include        <stdio.h>
            7 #include        <unistd.h>
            8 #include        "plan9.h"
            9 #endif
           10 #include        "hdr.h"
           11 #include        "conv.h"
           12 #include        "kuten208.h"
           13 #include        "jis.h"
           14 
           15 /*
           16         a state machine for interpreting all sorts of encodings
           17 */
           18 static void
           19 alljis(int c, Rune **r, long input_loc)
           20 {
           21         static enum { state0, state1, state2, state3, state4 } state = state0;
           22         static int set8 = 0;
           23         static int japan646 = 0;
           24         static int lastc;
           25         int n;
           26         long l;
           27 
           28 again:
           29         switch(state)
           30         {
           31         case state0:        /* idle state */
           32                 if(c == ESC){ state = state1; return; }
           33                 if(c < 0) return;
           34                 if(!set8 && (c < 128)){
           35                         if(japan646){
           36                                 switch(c)
           37                                 {
           38                                 case '\\':        emit(0xA5); return;        /* yen */
           39                                 case '~':        emit(0xAF); return;        /* spacing macron */
           40                                 default:        emit(c); return;
           41                                 }
           42                         } else {
           43                                 emit(c);
           44                                 return;
           45                         }
           46                 }
           47                 if(c < 0x21){        /* guard against bogus characters in JIS mode */
           48                         if(squawk)
           49                                 EPR "%s: non-JIS character %02x in %s near byte %ld\n", argv0, c, file, input_loc);
           50                         emit(c);
           51                         return;
           52                 }
           53                 lastc = c; state = state4; return;
           54 
           55         case state1:        /* seen an escape */
           56                 if(c == '$'){ state = state2; return; }
           57                 if(c == '('){ state = state3; return; }
           58                 emit(ESC); state = state0; goto again;
           59 
           60         case state2:        /* may be shifting into JIS */
           61                 if((c == '@') || (c == 'B')){
           62                         set8 = 1; state = state0; return;
           63                 }
           64                 emit(ESC); emit('$'); state = state0; goto again;
           65 
           66         case state3:        /* may be shifting out of JIS */
           67                 if((c == 'J') || (c == 'H') || (c == 'B')){
           68                         japan646 = (c == 'J');
           69                         set8 = 0; state = state0; return;
           70                 }
           71                 emit(ESC); emit('('); state = state0; goto again;
           72 
           73         case state4:        /* two part char */
           74                 if(c < 0){
           75                         if(squawk)
           76                                 EPR "%s: unexpected EOF in %s\n", argv0, file);
           77                         c = 0x21 | (lastc&0x80);
           78                 }
           79                 if(CANS2J(lastc, c)){        /* ms dos sjis */
           80                         int hi = lastc, lo = c;
           81                         S2J(hi, lo);                        /* convert to 208 */
           82                         n = hi*100 + lo - 3232;                /* convert to kuten208 */
           83                 } else
           84                         n = (lastc&0x7F)*100 + (c&0x7f) - 3232;        /* kuten208 */
           85                 if((n >= KUTEN208MAX) || ((l = tabkuten208[n]) == -1)){
           86                         nerrors++;
           87                         if(squawk)
           88                                 EPR "%s: unknown kuten208 %d (from 0x%x,0x%x) near byte %ld in %s\n", argv0, n, lastc, c, input_loc, file);
           89                         if(!clean)
           90                                 emit(BADMAP);
           91                 } else {
           92                         if(l < 0){
           93                                 l = -l;
           94                                 if(squawk)
           95                                         EPR "%s: ambiguous kuten208 %d (mapped to 0x%lx) near byte %ld in %s\n", argv0, n, l, input_loc, file);
           96                         }
           97                         emit(l);
           98                 }
           99                 state = state0;
          100         }
          101 }
          102 
          103 /*
          104         a state machine for interpreting ms-kanji == shift-jis.
          105 */
          106 static void
          107 ms(int c, Rune **r, long input_loc)
          108 {
          109         static enum { state0, state1, state2, state3, state4 } state = state0;
          110         static int set8 = 0;
          111         static int japan646 = 0;
          112         static int lastc;
          113         int n;
          114         long l;
          115 
          116 again:
          117         switch(state)
          118         {
          119         case state0:        /* idle state */
          120                 if(c == ESC){ state = state1; return; }
          121                 if(c < 0) return;
          122                 if(!set8 && (c < 128)){
          123                         if(japan646){
          124                                 switch(c)
          125                                 {
          126                                 case '\\':        emit(0xA5); return;        /* yen */
          127                                 case '~':        emit(0xAF); return;        /* spacing macron */
          128                                 default:        emit(c); return;
          129                                 }
          130                         } else {
          131                                 emit(c);
          132                                 return;
          133                         }
          134                 }
          135                 lastc = c; state = state4; return;
          136 
          137         case state1:        /* seen an escape */
          138                 if(c == '$'){ state = state2; return; }
          139                 if(c == '('){ state = state3; return; }
          140                 emit(ESC); state = state0; goto again;
          141 
          142         case state2:        /* may be shifting into JIS */
          143                 if((c == '@') || (c == 'B')){
          144                         set8 = 1; state = state0; return;
          145                 }
          146                 emit(ESC); emit('$'); state = state0; goto again;
          147 
          148         case state3:        /* may be shifting out of JIS */
          149                 if((c == 'J') || (c == 'H') || (c == 'B')){
          150                         japan646 = (c == 'J');
          151                         set8 = 0; state = state0; return;
          152                 }
          153                 emit(ESC); emit('('); state = state0; goto again;
          154 
          155         case state4:        /* two part char */
          156                 if(c < 0){
          157                         if(squawk)
          158                                 EPR "%s: unexpected EOF in %s\n", argv0, file);
          159                         c = 0x21 | (lastc&0x80);
          160                 }
          161                 if(CANS2J(lastc, c)){        /* ms dos sjis */
          162                         int hi = lastc, lo = c;
          163                         S2J(hi, lo);                        /* convert to 208 */
          164                         n = hi*100 + lo - 3232;                /* convert to kuten208 */
          165                 } else {
          166                         nerrors++;
          167                         if(squawk)
          168                                 EPR "%s: illegal byte pair (0x%x,0x%x) near byte %ld in %s\n", argv0, lastc, c, input_loc, file);
          169                         if(!clean)
          170                                 emit(BADMAP);
          171                         state = state0;
          172                         goto again;
          173                 }
          174                 if((n >= KUTEN208MAX) || ((l = tabkuten208[n]) == -1)){
          175                         nerrors++;
          176                         if(squawk)
          177                                 EPR "%s: unknown kuten208 %d (from 0x%x,0x%x) near byte %ld in %s\n", argv0, n, lastc, c, input_loc, file);
          178                         if(!clean)
          179                                 emit(BADMAP);
          180                 } else {
          181                         if(l < 0){
          182                                 l = -l;
          183                                 if(squawk)
          184                                         EPR "%s: ambiguous kuten208 %d (mapped to 0x%lx) near byte %ld in %s\n", argv0, n, l, input_loc, file);
          185                         }
          186                         emit(l);
          187                 }
          188                 state = state0;
          189         }
          190 }
          191 
          192 /*
          193         a state machine for interpreting ujis == EUC
          194 */
          195 static void
          196 ujis(int c, Rune **r, long input_loc)
          197 {
          198         static enum { state0, state1 } state = state0;
          199         static int lastc;
          200         int n;
          201         long l;
          202 
          203         switch(state)
          204         {
          205         case state0:        /* idle state */
          206                 if(c < 0) return;
          207                 if(c < 128){
          208                         emit(c);
          209                         return;
          210                 }
          211                 if(c == 0x8e){        /* codeset 2 */
          212                         nerrors++;
          213                         if(squawk)
          214                                 EPR "%s: unknown codeset 2 near byte %ld in %s\n", argv0, input_loc, file);
          215                         if(!clean)
          216                                 emit(BADMAP);
          217                         return;
          218                 }
          219                 if(c == 0x8f){        /* codeset 3 */
          220                         nerrors++;
          221                         if(squawk)
          222                                 EPR "%s: unknown codeset 3 near byte %ld in %s\n", argv0, input_loc, file);
          223                         if(!clean)
          224                                 emit(BADMAP);
          225                         return;
          226                 }
          227                 lastc = c;
          228                 state = state1;
          229                 return;
          230 
          231         case state1:        /* two part char */
          232                 if(c < 0){
          233                         if(squawk)
          234                                 EPR "%s: unexpected EOF in %s\n", argv0, file);
          235                         c = 0xA1;
          236                 }
          237                 n = (lastc&0x7F)*100 + (c&0x7F) - 3232;        /* kuten208 */
          238                 if((n >= KUTEN208MAX) || ((l = tabkuten208[n]) == -1)){
          239                         nerrors++;
          240                         if(squawk)
          241                                 EPR "%s: unknown kuten208 %d (from 0x%x,0x%x) near byte %ld in %s\n", argv0, n, lastc, c, input_loc, file);
          242                         if(!clean)
          243                                 emit(BADMAP);
          244                 } else {
          245                         if(l < 0){
          246                                 l = -l;
          247                                 if(squawk)
          248                                         EPR "%s: ambiguous kuten208 %d (mapped to 0x%lx) near byte %ld in %s\n", argv0, n, l, input_loc, file);
          249                         }
          250                         emit(l);
          251                 }
          252                 state = state0;
          253         }
          254 }
          255 
          256 /*
          257         a state machine for interpreting jis-kanji == 2022-JP
          258 */
          259 static void
          260 jis(int c, Rune **r, long input_loc)
          261 {
          262         static enum { state0, state1, state2, state3, state4 } state = state0;
          263         static int set8 = 0;
          264         static int japan646 = 0;
          265         static int lastc;
          266         int n;
          267         long l;
          268 
          269 again:
          270         switch(state)
          271         {
          272         case state0:        /* idle state */
          273                 if(c == ESC){ state = state1; return; }
          274                 if(c < 0) return;
          275                 if(!set8 && (c < 128)){
          276                         if(japan646){
          277                                 switch(c)
          278                                 {
          279                                 case '\\':        emit(0xA5); return;        /* yen */
          280                                 case '~':        emit(0xAF); return;        /* spacing macron */
          281                                 default:        emit(c); return;
          282                                 }
          283                         } else {
          284                                 emit(c);
          285                                 return;
          286                         }
          287                 }
          288                 lastc = c; state = state4; return;
          289 
          290         case state1:        /* seen an escape */
          291                 if(c == '$'){ state = state2; return; }
          292                 if(c == '('){ state = state3; return; }
          293                 emit(ESC); state = state0; goto again;
          294 
          295         case state2:        /* may be shifting into JIS */
          296                 if((c == '@') || (c == 'B')){
          297                         set8 = 1; state = state0; return;
          298                 }
          299                 emit(ESC); emit('$'); state = state0; goto again;
          300 
          301         case state3:        /* may be shifting out of JIS */
          302                 if((c == 'J') || (c == 'H') || (c == 'B')){
          303                         japan646 = (c == 'J');
          304                         set8 = 0; state = state0; return;
          305                 }
          306                 emit(ESC); emit('('); state = state0; goto again;
          307 
          308         case state4:        /* two part char */
          309                 if(c < 0){
          310                         if(squawk)
          311                                 EPR "%s: unexpected EOF in %s\n", argv0, file);
          312                         c = 0x21 | (lastc&0x80);
          313                 }
          314                 if((lastc&0x80) != (c&0x80)){        /* guard against latin1 in jis */
          315                         emit(lastc);
          316                         state = state0;
          317                         goto again;
          318                 }
          319                 n = (lastc&0x7F)*100 + (c&0x7f) - 3232;        /* kuten208 */
          320                 if((n >= KUTEN208MAX) || ((l = tabkuten208[n]) == -1)){
          321                         nerrors++;
          322                         if(squawk)
          323                                 EPR "%s: unknown kuten208 %d (from 0x%x,0x%x) near byte %ld in %s\n", argv0, n, lastc, c, input_loc, file);
          324                         if(!clean)
          325                                 emit(BADMAP);
          326                 } else {
          327                         if(l < 0){
          328                                 l = -l;
          329                                 if(squawk)
          330                                         EPR "%s: ambiguous kuten208 %d (mapped to 0x%lx) near byte %ld in %s\n", argv0, n, l, input_loc, file);
          331                         }
          332                         emit(l);
          333                 }
          334                 state = state0;
          335         }
          336 }
          337 
          338 static void
          339 do_in(int fd, void (*procfn)(int, Rune **, long), struct convert *out)
          340 {
          341         Rune ob[N];
          342         Rune *r, *re;
          343         uchar ibuf[N];
          344         int n, i;
          345         long nin;
          346 
          347         r = ob;
          348         re = ob+N-3;
          349         nin = 0;
          350         while((n = read(fd, ibuf, sizeof ibuf)) > 0){
          351                 for(i = 0; i < n; i++){
          352                         (*procfn)(ibuf[i], &r, nin++);
          353                         if(r >= re){
          354                                 OUT(out, ob, r-ob);
          355                                 r = ob;
          356                         }
          357                 }
          358                 if(r > ob){
          359                         OUT(out, ob, r-ob);
          360                         r = ob;
          361                 }
          362         }
          363         (*procfn)(-1, &r, nin);
          364         if(r > ob)
          365                 OUT(out, ob, r-ob);
          366         OUT(out, ob, 0);
          367 }
          368 
          369 void
          370 jis_in(int fd, long *notused, struct convert *out)
          371 {
          372         USED(notused);
          373         do_in(fd, alljis, out);
          374 }
          375 
          376 void
          377 ujis_in(int fd, long *notused, struct convert *out)
          378 {
          379         USED(notused);
          380         do_in(fd, ujis, out);
          381 }
          382 
          383 void
          384 msjis_in(int fd, long *notused, struct convert *out)
          385 {
          386         USED(notused);
          387         do_in(fd, ms, out);
          388 }
          389 
          390 void
          391 jisjis_in(int fd, long *notused, struct convert *out)
          392 {
          393         USED(notused);
          394         do_in(fd, jis, out);
          395 }
          396 
          397 static int first = 1;
          398 
          399 static void
          400 tab_init(void)
          401 {
          402         int i;
          403         long l;
          404 
          405         first = 0;
          406         for(i = 0; i < NRUNE; i++)
          407                 tab[i] = -1;
          408         for(i = 0; i < KUTEN208MAX; i++)
          409                 if((l = tabkuten208[i]) != -1){
          410                         if(l < 0)
          411                                 tab[-l] = i;
          412                         else
          413                                 tab[l] = i;
          414                 }
          415 }
          416 
          417 
          418 /*        jis-kanji, or ISO 2022-JP        */
          419 void
          420 jisjis_out(Rune *base, int n, long *notused)
          421 {
          422         char *p;
          423         int i;
          424         Rune r;
          425         static enum { ascii, japan646, jp2022 } state = ascii;
          426 
          427         USED(notused);
          428         if(first)
          429                 tab_init();
          430         nrunes += n;
          431         p = obuf;
          432         for(i = 0; i < n; i++){
          433                 r = base[i];
          434                 if(r < 128){
          435                         if(state == jp2022){
          436                                 *p++ = ESC; *p++ = '('; *p++ = 'B';
          437                                 state = ascii;
          438                         }
          439                         *p++ = r;
          440                 } else {
          441                         if(tab[r] != -1){
          442                                 if(state != jp2022){
          443                                         *p++ = ESC; *p++ = '$'; *p++ = 'B';
          444                                         state = jp2022;
          445                                 }
          446                                 *p++ = tab[r]/100 + ' ';
          447                                 *p++ = tab[r]%100 + ' ';
          448                                 continue;
          449                         }
          450                         if(squawk)
          451                                 EPR "%s: rune 0x%x not in output cs\n", argv0, r);
          452                         nerrors++;
          453                         if(clean)
          454                                 continue;
          455                         *p++ = BYTEBADMAP;
          456                 }
          457         }
          458         noutput += p-obuf;
          459         if(p > obuf)
          460                 write(1, obuf, p-obuf);
          461 }
          462 
          463 /*        ms-kanji, or Shift-JIS        */
          464 void
          465 msjis_out(Rune *base, int n, long *notused)
          466 {
          467         char *p;
          468         int i, hi, lo;
          469         Rune r;
          470 
          471         USED(notused);
          472         if(first)
          473                 tab_init();
          474         nrunes += n;
          475         p = obuf;
          476         for(i = 0; i < n; i++){
          477                 r = base[i];
          478                 if(r < 128)
          479                         *p++ = r;
          480                 else {
          481                         if(tab[r] != -1){
          482                                 hi = tab[r]/100 + ' ';
          483                                 lo = tab[r]%100 + ' ';
          484                                 J2S(hi, lo);
          485                                 *p++ = hi;
          486                                 *p++ = lo;
          487                                 continue;
          488                         }
          489                         if(squawk)
          490                                 EPR "%s: rune 0x%x not in output cs\n", argv0, r);
          491                         nerrors++;
          492                         if(clean)
          493                                 continue;
          494                         *p++ = BYTEBADMAP;
          495                 }
          496         }
          497         noutput += p-obuf;
          498         if(p > obuf)
          499                 write(1, obuf, p-obuf);
          500 }
          501 
          502 /*        ujis, or EUC        */
          503 void
          504 ujis_out(Rune *base, int n, long *notused)
          505 {
          506         char *p;
          507         int i;
          508         Rune r;
          509 
          510         USED(notused);
          511         if(first)
          512                 tab_init();
          513         nrunes += n;
          514         p = obuf;
          515         for(i = 0; i < n; i++){
          516                 r = base[i];
          517                 if(r < 128)
          518                         *p++ = r;
          519                 else {
          520                         if(tab[r] != -1){
          521                                 *p++ = 0x80 | (tab[r]/100 + ' ');
          522                                 *p++ = 0x80 | (tab[r]%100 + ' ');
          523                                 continue;
          524                         }
          525                         if(squawk)
          526                                 EPR "%s: rune 0x%x not in output cs\n", argv0, r);
          527                         nerrors++;
          528                         if(clean)
          529                                 continue;
          530                         *p++ = BYTEBADMAP;
          531                 }
          532         }
          533         noutput += p-obuf;
          534         if(p > obuf)
          535                 write(1, obuf, p-obuf);
          536 }