/* ** Unicode Bidirectional Algorithm ** Reference Implementation ** Copyright 2023 Unicode, Inc. All rights reserved. ** Unpublished rights reserved under U.S. copyright laws. */ /* * brutils.c * * Collection of utility functions used by other modules. * These utility functions are used both by modules inside * the library and by functions in the bidiref executable. * * Exports: * GetFileFormat * SetFileFormat * * GetUBAVersion * GetUBAVersionStr * SetUBAVarsion * * Trace * TraceOn * TraceOff * TraceOffAll * * convertHexToInt * br_Evaluate * copyField * copySubField * br_GetBCFromLabel * br_ErrPrint */ /* * Change history: * * 2013-Jun-01 kenw Created. * 2013-Jun-19 kenw Add br_ErrPrint. * 2016-Sep-22 kenw Add cast to suppress strlen warning. * 2016-Oct-05 kenw Add GetUBAVersionStr. * 2017-Jun-27 kenw Updated for Unicode 10.0. * 2018-Jul-22 kenw Updated for Unicode 11.0. * 2019-Nov-19 kenw Updated for Unicode 12.0. * 2020-Mar-26 kenw Updated for Unicode 13.0. * 2021-Sep-23 kenw Updated for Unicode 14.0. * 2022-Sep-30 kenw Updated for Unicode 15.0. * 2023-Oct-03 kenw Updated for Unicode 15.1. */ #include #include #include #include #include #include "bidirefp.h" /* * SECTION: Generic version information. */ /* * MAINTENANCE NOTE: Update the ubaVersion to the latest defined * UBA version. */ static int ubaVersion = UBA151; /* Default to UBA151 */ static int fileFormat = FORMAT_A; /* Format of input data file */ /* * MAINTENANCE NOTE: When adding a new UBA version to the code, * add an explicit string for it in the following array, and mark * that as the default. 
*/ static char *versionNums[NUMVERSIONS] = { "X.X", "6.2", "6.3", "7.0", "8.0", "9.0", "10.0", "11.0", "12.0", "13.0", "14.0", "15.0", "15.1", "def:15.1" }; int GetFileFormat ( void ) { return ( fileFormat ); } void SetFileFormat ( int format ) { fileFormat = format; } int GetUBAVersion ( void ) { return ( ubaVersion ); } char* GetUBAVersionStr ( void ) { return ( versionNums[ubaVersion] ); } /* * SetUBAVersion() * * Set ubaVersion to the version passed in. * * UBACUR (unspecified) will default to latest version. */ void SetUBAVersion ( int version ) { ubaVersion = version; } /***********************************************************/ /* * SECTION: Traces. * * This set of routines supports a very simple set of trace * flags, with set, unset, and check functions, to enable * precise control over what types of debug output are * printed during execution. * * One 32-bit traceFlags variable is defined here, which * allows up to 32 individual trace bits to be defined for * the flags. * * The actual values of the trace flags are established * in bidiref.h. * * Initial default trace flag settings are established * based on chosen debug levels during program start, but * can be changed programmatically anytime during execution. */ static U_Int_32 traceFlags = 0; /* Trace flags for control of debug output */ /* * Trace: check a trace bit */ int Trace ( U_Int_32 traceValue ) { return ( ( ( traceValue & traceFlags ) == traceValue ) ? 1 : 0 ); } /* * TraceOn: turn on a trace bit. */ void TraceOn ( U_Int_32 traceValue ) { traceFlags |= traceValue; } /* * TraceOff: turn off a trace bit. */ void TraceOff ( U_Int_32 traceValue ) { traceFlags &= ~traceValue; } /* * TraceOffAll: turn off all trace bits. */ void TraceOffAll ( void ) { traceFlags = 0; } /***********************************************************/ /* * SECTION: Error Printing */ /* * br_ErrPrint * * Encapsulate console error logging, so it can be turned on * or off simply with a trace flag. 
*/ void br_ErrPrint ( char *errString ) { if ( Trace ( Trace15 ) ) { printf ( errString ); } } /***********************************************************/ /* * SECTION: Numeric evaluation utility routines. */ /* * Convert an "ASCII" representation of sb/mb int to real int. * * Bounded to 0 .. 0xFFFFFFFF. * * i.e. "c2ba" to 0xC2BA, "8000000A" to 0x8000000A, etc. * * RETURNS: * 0 converted o.k. * -1 input string too long to convert or too short to be valid. * -2 non-hex digit input. */ int convertHexToInt ( U_Int_32 *dest, char *source ) { int slen; int i; char *sp; char b1; unsigned char n1; slen = (int) strlen ( source ); if ( ( slen > 8 ) || ( slen < 4 ) ) { return ( -1 ); } sp = source; *dest = 0; for ( i = 0; i < slen; i++ ) { *dest <<= 4; b1 = (char)toupper(*sp++); if ( ( b1 >= 0x41 ) && ( b1 <= 0x46 ) ) { n1 = (unsigned char)(b1 - 55); } else if ( ( b1 >= 0x30 ) && ( b1 <= 0x39 ) ) { n1 = (unsigned char)(b1 - 48); } else { return ( -2 ); } *dest |= n1; } return 0 ; } /* * br_Evaluate * * Use the error checking strtol() function, rather * that atoi(), to provide a modicum of validation for * bad data in the test case file. * * This routine does not bother checking underflow * and overflow cases or the cast, because it is far more likely * that typos or missing data will be the source of * the problem, rather than monstrously large numbers. */ int br_Evaluate ( char *data, int *outval ) { char *endptr; long val; errno = 0; val = strtol ( data, &endptr, 10 ); if ( ( errno != 0 ) && ( val == 0 ) ) { *outval = 0; return ( -1 ); } if ( endptr == data ) { *outval = 0; return ( -1 ); } *outval = (int)val; return ( 1 ); } /***********************************************************/ /* * SECTION: Field parsing utility routines. */ #define DELIM ';' /* * copyField * * Copy one semicolon-delimited field from a line parsed from * a UCD-style property file. 
* * This routine does not stop on spaces, but *does* stop on * EOLN characters, which are not trimmed from lines by * the generic line parsers. * * This routine, like all others in this set of utilities, * returns a pointer to the start of the *next* field in * in the src string, so it can be called repeated to parse * out multiple fields. * * Check for s = "" or *s = '\0' as an end of input condition. */ char *copyField ( char *dest, char *src ) { char *sp; char *dp; sp = src; dp = dest; while ( (*sp != DELIM) && (*sp != '\0') && (*sp != '\n') ) { *dp++ = *sp++; } *dp = '\0'; if ( ( *sp == DELIM ) || ( *sp == '\n') ) { sp++; } return sp; } /* * copySubField * * As for copyField, but copies one space-delimited subfield * out of a multi-value field. * * This is used, for instance, to parse out a sequence of * levels, one at a time. * * This routine does not check for EOLN, because it assumes * the src string is itself an already-parsed null-delimited * string. */ char *copySubField ( char *dest, char *src ) { char *sp; char *dp; sp = src; dp = dest; while ( (*sp != ' ') && (*sp != '\0') ) { *dp++ = *sp++; } *dp = '\0'; if ( ( *sp == ' ' ) ) { sp++; } return sp; } /***********************************************************/ /* * SECTION: Property alias parsing utility routines. */ /* * br_GetBCFromLabel * * Get a Bidi_Class enumerated value from a string parsed * from a bidi field in a test case, from UnicodeData.txt, * or some other source. This supports turning the * symbolic alias (assumed here to be expressed in * uppercase, as in UnicodeData.txt) into a defined * enumerated value, for use in further processing. 
*/
/*
 * Matching is exact and case-sensitive: the whole string must
 * equal one of the aliases in the table below. Anything else,
 * including a NULL or empty string or a known alias followed
 * by extra characters, yields BIDI_Unknown.
 */
int br_GetBCFromLabel ( char* src )
{
    /* Alias-to-value table, in the order the aliases were originally tested. */
    static const struct
    {
        const char *label;
        int bc;
    } bcLabels[] =
    {
        { "L",   BIDI_L   },
        { "LRE", BIDI_LRE },
        { "LRO", BIDI_LRO },
        { "LRI", BIDI_LRI },
        { "R",   BIDI_R   },
        { "RLE", BIDI_RLE },
        { "RLO", BIDI_RLO },
        { "RLI", BIDI_RLI },
        { "B",   BIDI_B   },
        { "BN",  BIDI_BN  },
        { "S",   BIDI_S   },
        { "PDF", BIDI_PDF },
        { "PDI", BIDI_PDI },
        { "NSM", BIDI_NSM },
        { "FSI", BIDI_FSI },
        { "EN",  BIDI_EN  },
        { "ES",  BIDI_ES  },
        { "ET",  BIDI_ET  },
        { "AN",  BIDI_AN  },
        { "AL",  BIDI_AL  },
        { "CS",  BIDI_CS  },
        { "WS",  BIDI_WS  },
        { "ON",  BIDI_ON  }
    };
    size_t i;

    if ( src == NULL )
    {
        return ( BIDI_Unknown );
    }

    for ( i = 0; i < sizeof ( bcLabels ) / sizeof ( bcLabels[0] ); i++ )
    {
        if ( strcmp ( src, bcLabels[i].label ) == 0 )
        {
            return ( bcLabels[i].bc );
        }
    }
    return ( BIDI_Unknown );
}