cc1: Add correct handling of universal character names - scc - simple c99 compiler
HTML git clone git://git.simple-cc.org/scc
DIR Log
DIR Files
DIR Refs
DIR Submodules
DIR README
DIR LICENSE
---
DIR commit f29cc7e4e4ff9756684411fadfca18f3ff6556d2
DIR parent 3103a4ad89edc64ed22bd66de2a49b15a7e43d1b
HTML Author: Roberto E. Vargas Caballero <k0ga@shike2.net>
Date: Tue, 7 Apr 2026 09:22:21 +0200
cc1: Add correct handling of universal character names
This commit adds support only in character constants, because
we think that adding support for identifiers is worthless and
it makes more sense to support (or accept in a careless way) UTF-8
identifiers. Support for universal character names in
strings is coming.
Diffstat:
M src/cmd/scc-cc/cc1/lex.c | 56 +++++++++++++++++++++++--------
A tests/cc/execute/0259-multichar.c | 15 +++++++++++++++
M tests/cc/execute/scc-tests.lst | 1 +
3 files changed, 58 insertions(+), 14 deletions(-)
---
DIR diff --git a/src/cmd/scc-cc/cc1/lex.c b/src/cmd/scc-cc/cc1/lex.c
@@ -11,6 +11,8 @@
#include <scc/scc.h>
#include "cc1.h"
+#define REPLACECHAR 0xFFFD
+
int yytoken;
struct yystype yylval;
char yytext[STRINGSIZ+3];
@@ -516,10 +518,11 @@ number(void)
return CONSTANT;
}
-static int
-escape(void)
+static Rune
+escape(int multi)
{
- int c, d, i, cnt, base;
+ Rune c;
+ int uni, d, i, cnt, base;
switch (*++input->p) {
case 'a':
@@ -544,17 +547,21 @@ escape(void)
return '\\';
case '\?':
return '\?';
+ case 'U':
+ cnt = 8;
+ goto check_universal;
case 'u':
- /*
- * FIXME: universal constants are not correctly handled
- */
- if (!isdigit(*++input->p))
- warn("incorrect digit for numerical character constant");
- base = 10;
+ cnt = 4;
+ check_universal:
+ if (!multi)
+ warn("multi-character character constant");
+ ++input->p;
+ uni = 1;
+ base = 16;
break;
case 'x':
- if (!isxdigit(*++input->p))
- warn("\\x used with no following hex digits");
+ ++input->p;
+ uni = 0;
cnt = 2;
base = 16;
break;
@@ -566,6 +573,7 @@ escape(void)
case '5':
case '6':
case '7':
+ uni = 0;
cnt = 3;
base = 8;
break;
@@ -574,7 +582,7 @@ escape(void)
return ' ';
}
- for (c = i = 0; i < cnt; ++i) {
+ for (c = i = 0; i < cnt && isxdigit(*input->p); ++i) {
static char digits[] = "0123456789ABCDEF";
char *p = strchr(digits, toupper(*input->p));
@@ -586,6 +594,26 @@ escape(void)
}
--input->p;
+ if (base == 16 && i != cnt) {
+ if (uni) {
+ warn("incorrect digit for universal character constant");
+ c = REPLACECHAR;
+ } else {
+ warn("\\x used with no following hex digits");
+ c = ' ';
+ }
+ }
+
+ if (!uni)
+ return c;
+
+ if (c < 0xa0 && (c == 0x24 || c == 0x40 || c == 0x60)
+ || c >= 0xD800 && c <= 0xDFFF
+ || c >= 0x110000) {
+ warn("invalid universal character constant");
+ c = REPLACECHAR;
+ }
+
return c;
}
@@ -626,7 +654,7 @@ decode(int multi)
Rune r;
if (*input->p == '\\') {
- r = escape();
+ r = escape(multi);
return r;
}
@@ -713,7 +741,7 @@ string(void)
esc = (c == '\\' && !esc && disescape);
if (c == '\\' && !esc)
- c = escape();
+ c = escape(0);
if (bp == &yytext[STRINGSIZ+1]) {
/* too long, ignore everything until next quote */
DIR diff --git a/tests/cc/execute/0259-multichar.c b/tests/cc/execute/0259-multichar.c
@@ -0,0 +1,15 @@
+
+int
+main(void)
+{
+ int a = L'\u00a1';
+
+ if (a != 0xa1)
+ return 1;
+ if (L'\u00a1' != 0xa1)
+ return 2;
+ if (L'\U000000b1' != 0xb1)
+ return 3;
+
+ return 0;
+}
DIR diff --git a/tests/cc/execute/scc-tests.lst b/tests/cc/execute/scc-tests.lst
@@ -249,3 +249,4 @@
0256-ary.c
0257-main.c
0258-variadic.c
+0259-multichar.c