/* ** Unicode Bidirectional Algorithm ** Reference Implementation ** Copyright 2015, Unicode, Inc. All rights reserved. ** Unpublished rights reserved under U.S. copyright laws. */ /* * bidirefp.h * * Private types, constants, and declarations used by bidiref. */ /* * Change history: * * 2013-Jun-02 kenw Created. * 2013-Jun-05 kenw Added diretyBit to UBACONTEXT. * 2013-Jun-19 kenw Add declaration for br_ErrPrint. * 2013-Jul-08 kenw Move BR_MAXINPUTLEN to bidiref.h. * 2014-Apr-17 kenw Add accelerator flags to BIDIUNIT. * 2015-Jun-05 kenw Set MAXPAIRINGDEPTH to 63 for UBA 8.0. * 2015-Aug-19 kenw Add CJK Extension E range. * 2015-Dec-04 kenw Bumped up bracket stack size by one. */ #ifndef __BIDIREFP_LOADED #define __BIDIREFP_LOADED #ifdef __cplusplus extern "C" { #endif #include "bidiref.h" /* * Constants defining Unihan ranges. */ #define CJK_URO_FIRST (0x4E00) #define CJK_URO_LAST (0x9FCC) #define CJK_EXTA_FIRST (0x3400) #define CJK_EXTA_LAST (0x4DB5) #define CJK_EXTB_FIRST (0x20000) #define CJK_EXTB_LAST (0x2A6D6) #define CJK_EXTC_FIRST (0x2A700) #define CJK_EXTC_LAST (0x2B734) #define CJK_EXTD_FIRST (0x2B740) #define CJK_EXTD_LAST (0x2B81D) #define CJK_EXTE_FIRST (0x2B820) #define CJK_EXTE_LAST (0x2CEA1) /* * Enumerated types for relevant Unicode properties, * to make reference to them easier to understand in the algorithm. */ typedef enum { BIDI_None, BIDI_Unknown, BIDI_L, /* 0x00 strong: left-to-right (bc=L) */ BIDI_ON, /* 0x01 neutral: Other Neutral (bc=ON) */ BIDI_R, /* 0x02 strong: right-to-left (bc=R) */ BIDI_EN, /* 0x03 weak: European Number (bc=EN) */ BIDI_ES, /* 0x04 weak: European Number Separator (bc=ES) */ BIDI_ET, /* 0x05 weak: European Number Terminator (bc=ET) */ BIDI_AN, /* 0x06 weak: Arabic Number (bc=AN) */ BIDI_CS, /* 0x07 weak: Common Number Separator (bc=CS) */ BIDI_B, /* 0x08 neutral: Paragraph Separator (bc=B) */ BIDI_S, /* 0x09 neutral: Segment Separator (bc=S) */ BIDI_WS, /* 0x0A neutral: White Space (bc=WS) */ BIDI_AL, /* 0x0B strong: right-to-left Arabic (bc=AL) */ BIDI_NSM, /* 0x0C weak: non-spacing mark (bc=NSM) */ BIDI_BN, /* 0x0D weak: boundary neutral (bc=BN) */ BIDI_PDF, /* 0x0E format: pop directional formatting (bc=PDF) */ BIDI_LRE, /* format: left-to-right embedding */ BIDI_LRO, /* format: left-to-right override */ BIDI_RLE, /* format: right-to-left embedding */ BIDI_RLO, /* format: right-to-left override */ BIDI_LRI, /* format: left-to-right isolate */ BIDI_RLI, /* format: right-to-left isolate */ BIDI_FSI, /* format: first strong isolate */ BIDI_PDI, /* format: pop directional isolate */ BIDI_MAX } BIDIPROP; typedef enum { BPT_O, BPT_C, BPT_None } BPTPROP; /* * File format defines. * * These defines can be used to distinguish between formats * of data to be read in. They are used to branch the * parsing functions in brinput.c. */ #define FORMAT_A (0) /* Format used in BidiCharacterTest.txt */ #define FORMAT_B (1) /* Format used in BidiTest.txt */ /* * ALGORITHM_STATE is used to store how far along in * algorithm the data has been processed. This can be * used to help the debug display to show relevant * information. */ typedef enum { State_Unitialized, /* context allocated, but no data */ State_Initialized, /* test case data read in and parsed */ State_P3Done, /* rules done through P3 */ State_X8Done, /* rules done through X8 */ State_X9Done, /* rules done through X9 */ State_RunsDone, /* rules done through X10 part 1: runs are identified */ State_X10Done, /* rules done through X10: runs & seqs are identified */ State_W1Done, /* rules done through W1: combining marks are resolved */ State_W2Done, /* rules done through W2: */ State_W3Done, /* rules done through W3: */ State_W4Done, /* rules done through W4: */ State_W5Done, /* rules done through W5: */ State_W6Done, /* rules done through W6: */ State_W7Done, /* rules done through W7: weak types are resolved */ State_N0Done, /* rules done through N0: bracket pairs are resolved */ State_N1Done, /* rules done through N1: */ State_N2Done, /* rules done through N2: neutral types are resolved */ State_I2Done, /* rules done through I2: implicit levels are resolved */ State_L1Done, /* rules done through L1: trailing whitespace resolved */ State_L2Done, /* rules done through L2: reordering data available */ State_Complete /* finished application of rules: ready for checking */ } ALGORITHM_STATE; /* * BIDIPROPDATA * * This struct is used to store property information * about characters and code points. * * Only limited property data is stored -- just the data * needed for running the UBA algorithm. */ typedef struct { int han; BIDIPROP bidivalue; U_Int_32 bpb; BPTPROP bpt; } BIDIDATA; typedef BIDIDATA *BDPTR; #define BPB_None (0xFFFFFFFF) typedef enum { Dir_LTR, Dir_RTL, Dir_Auto, Dir_Unknown } Paragraph_Direction; typedef enum { Override_Neutral, Override_LTR, Override_RTL } D_Override_Status; /* * A single status stack definition is shared by both the * UBA62 and UBA63 reference implementations. The * UBA62 implementation makes no use of the isolate_status * field. * * The status stack is used by rules X1-X8. */ typedef struct { int embedding_level; D_Override_Status override_status; /* direction */ int isolate_status; /* boolean */ } STATUSSTACKELEMENT; typedef STATUSSTACKELEMENT *STACKPTR; /* * The maximum_depth for embedding for UBA62. */ #define MAX_DEPTH_62 61 /* * The maximum_depth for embedding for UBA63. */ #define MAX_DEPTH_63 125 /* * Typedefs used in the bracket pairing algorithm in rule N0. */ typedef struct { U_Int_32 bracket; int pos; } BRACKETSTACKELEMENT; typedef BRACKETSTACKELEMENT *BRACKETSTACKPTR; typedef struct Pairing_Element_Struct { struct Pairing_Element_Struct *next; int openingpos; int closingpos; } PAIRINGELEMENT; typedef PAIRINGELEMENT *PAIRINGPTR; /* * MAXPAIRINGDEPTH is used to declare the bracket pairing stack. * * For UBA 8.0 and subsequent versions, this depth is specified * to be exactly 63. (See brrule.c, where the stack allocation is * made as MAXPAIRINGDEPTH +1.) * * For UBA 6.3 (and 7.0), these depth was implementation-dependent, * and not specified by UAX #9. For the 6.3 and 7.0 versions of * bidiref it was set to half the MAXINPUTLEN, because in principle an * input string could consist *only* of brackets to pair up: "(({{[]}}))", * etc., in which case the maximum possible set of pairs would be * half the text length. * * Starting with the 8.0 version of bidiref, however, 63 is used * also for testing the 6.3 and 7.0 versions of UBA, and for * consistency, the stack full error handling is also based * on the 8.0 specification. */ #define MAXPAIRINGDEPTH (63) /* * Provide a reasonable length for input-handling buffers, based * on MAXINPUTLEN, which limits how many code points the * implementation will handle for an input string. */ #define BUFFERLEN (5 * BR_MAXINPUTLEN) /* for data chunks parsed from input */ #define RAWBUFFERLEN (2 * BUFFERLEN) /* full unparsed input line */ /* * BIDIUNIT * * This struct is the primitive manipulated by * the UBA reference implementation. It is used * to construct a vector in the UBACONTEXT, * containing the character data (stored as UTF-32), * the looked-up property data, so that the UBA, * which works primarily on the basis of the Bidi_Class * values, has the correct input, along with the * level and order information. * * Starting with Version 2.0 of bidiref, a number of * accelerator flags are added, to speed up checking of * common combinations of Bidi_Class values. Each flag * is conceptually a Boolean, and could be stored as a bit * in a bitfield, but in this reference implementation * is simply stored as another int field in the BIDIUNIT. * These values are all initialized when the BIDIUNIT * vector is initialized for a testcase string. Some need * to be reset, whenever a rule changes the current bc * value for a character. */ typedef struct { U_Int_32 c; /* character value, stored as UTF-32 */ BIDIPROP bc; /* current Bidi_Class value */ BIDIPROP orig_bc; /* store original value for Bidi_Class */ int bc_numeric; /* accelerator for bc = (AN or EN) */ int bc_isoinit; /* accelerator for bc = (LRI or RLI or FSI) */ U_Int_32 bpb; /* bidi paired bracket property */ BPTPROP bpt; /* bidi paired bracket type property */ int level; /* current embedding level */ int expLevel; /* store expected level for test case */ int order; /* store position for reordering */ int order2; /* spare order array used in reversal algorithm */ } BIDIUNIT; typedef BIDIUNIT *BIDIUNITPTR; #define NOLEVEL (-1) /* * A text chain is simply an array of pointers to BIDIUNITs. * * This data structure is introduced to help in the abstraction * of rule application between UBA62 (and earlier), where * rules apply to level runs, and UBA63 (and later), where * rules apply to isolating run sequences. Because * isolating run sequences may contain indefinite lists * of discontiguous level runs, it is difficult to * specify rule application directly to the isolating * run sequences -- the concept of previous character * and next character in the sequence gets rather complex * and makes the implementation difficult. * * One possible approach to the isolating run sequences * would be to simply clone the entire text into * allocated contiguous BIDIUNIT vectors for each * isolating run sequence identified. However, that makes * bookkeeping in the original input structure a bit more * complex. Instead, whenever an isolating run sequence * is identified, a contiguous text chain is allocated * and initialized instead. This keeps all the data * in the original input structure, and simply embodies * the traversal order (and prior and next relation) * needed for rule application. * * So that the rule application can be uniform, a * text chain is also allocated for level runs, as well. * * The text chain is hung off the "textChain" field in * both the BIDIRUN and the ISOLATING_RUN_SEQUENCE * struct definitions. The "len" field defines the * length of that array. */ /* * BIDIRUN * * Once embedding levels are determined, the UBA * treats each contiguous sequence at the same level * as a distinct run. * * To simplify the processing of runs, a list of runs * is constructed as a linked list, which hangs off * the UBACONTEXT. * * Each BIDIRUN consists of pointers to the first * and last BIDIUNIT of the run, and then additional * information calculated during the X10 rule, when * the runs are identified. * * This may not be the most efficient approach to * an implementation, but it makes it much easier * to express the X10 rule and subsequent rules which * process the runs individually. * * The seqID is only used in processing run lists * into isolating run sequence lists in UBA63. * It is initialized to zero. During processing * to identify isolating run sequences, it is set * to the sequence id of the isolating run sequence * that a run is assigned to. */ typedef struct BidiRunStruct { struct BidiRunStruct *next; /* next run */ BIDIUNITPTR first; /* first element of this run */ BIDIUNITPTR last; /* last element of this run */ int len; /* explicit len, to simplify processing */ int runID; /* stamp run with id for debugging */ int seqID; /* isolating run sequence id */ int level; /* embedding level of this run */ BIDIPROP sor; /* direction of start of run: L or R */ BIDIPROP eor; /* direction of end of run: L or R */ BIDIUNITPTR *textChain; /* constructed text chain */ } BIDIRUN; typedef BIDIRUN *BIDIRUNPTR; /* * The BidiRunListStruct abstracts the creation of * a list of bidi runs for attachment to the isolating * run lists. Instead of duplicating all the run information * into that list, the list consists just of pointers to * the already allocated basic list of runs. */ typedef struct BidiRunListStruct { struct BidiRunListStruct *next; BIDIRUNPTR run; } BIDIRUNLISTELEMENT; typedef BIDIRUNLISTELEMENT *BIDIRUNLISTPTR; /* * ISOLATING_RUN_SEQUENCE * * This is a concept introduced in UBA63. * * Essentially it consists of an ordered sequence of * bidi runs, as defined in BD13. * * It is implemented here by attaching the list of runs * associated with this particular isolating run sequence. * The attached list just contains pointers to each of * the relevant BIDIRUN structs in the already constructed * sequential list of runs attached to the UBACONTEXT. * * All runs associated with a single isolating run sequence * are at the same level, so that level can be stored * in the isolating run sequence struct for ease of access. * * Each isolating run sequence has a start of sequence (sos) * and end of sequence (eos) directional value assigned. * These are calculated based on the sor and eor values * for the associated list of runs, but again, are stored * in the isolating run sequence struct for ease of access. */ typedef struct IRSequenceStruct { struct IRSequenceStruct *next; /* next sequence */ BIDIRUNLISTPTR theRuns; /* list of runs in this sequence */ BIDIRUNLISTPTR lastRun; /* for list appending */ int len; /* explicit len, to simplify processing */ int seqID; /* stamp seq with id for debugging */ int level; /* embedding level of this seq */ BIDIPROP sos; /* direction of start of seq: L or R */ BIDIPROP eos; /* directino of end of seq: L or R */ BIDIUNITPTR *textChain; /* constructed text chain */ } ISOLATING_RUN_SEQUENCE; typedef ISOLATING_RUN_SEQUENCE *IRSEQPTR; /* * UBACONTEXT * * This struct is used to store all context associated * with the bidi reference UBA processing, including * input, expected test output, * and the constructed runs and other intermediate data. * * theText, testLen, paragraphDirection are input. * theRuns and paragraphEmbeddingLevel are calculated. * expEmbeddingLevel, expOrder are parsed * from the testcase data and checked against * calculated values. * For simplicity, the expected levels data parsed * from the test case are stored with theText. * * Starting from Version 2.0, theText pointer is * allocated statically, simply pointing to a static * buffer of BR_MAXINPUTLEN length, to cut down on * repeated dynamic allocations during long testcase * runs. */ typedef struct { ALGORITHM_STATE state; /* track state */ int dirtyBit; /* used for debug output control */ int testId; /* used for tagging trace output */ Paragraph_Direction paragraphDirection; /* input */ int paragraphEmbeddingLevel; /* calculated */ int textLen; /* input */ BIDIUNITPTR theText; /* input */ BIDIRUNPTR theRuns; /* calculated */ BIDIRUNPTR lastRun; /* for list appending */ IRSEQPTR theSequences; /* calculated: UBA63 only */ IRSEQPTR lastSequence; /* for list appending */ int expEmbeddingLevel; /* expected test result */ char *expOrder; /* expected test result */ } UBACONTEXT; typedef UBACONTEXT *UBACTXTPTR; /* * BIDI_RULE_TYPE * * An enumerated type which facilitates lookup of * rule callbacks and other data by rule type. */ typedef enum { UBA_RULE_W1, UBA_RULE_W2, UBA_RULE_W3, UBA_RULE_W4, UBA_RULE_W5, UBA_RULE_W6, UBA_RULE_W7, UBA_RULE_N0, UBA_RULE_N1, UBA_RULE_N2, UBA_RULE_Last } BIDI_RULE_TYPE; /* * RULE_CONTEXT * * A struct used to package up parameter information * for a class of rules, to make parameter passing * and function prototype neater. */ typedef struct { BIDIUNITPTR *textChain; int len; int level; BIDIPROP sot; BIDIPROP eot; } BIDI_RULE_CONTEXT; typedef BIDI_RULE_CONTEXT *BIDIRULECTXTPTR; /* * BIDIRUN_RULE_FPTR * * Function type of a UBA rule which operates on * a single text chain. * * This type is used to better abstract the bidi rule * dispatch mechanism, to distinguish version-specific * operations to extract level runs from the logic * which then applies to that run. */ typedef int (*BIDIRUN_RULE_FPTR)( BIDIRULECTXTPTR brp ); typedef struct { BIDIRUN_RULE_FPTR ruleMethod; /* actual function callback for rule */ char *ruleLabel; /* printable label for the function */ char *ruleNumber; /* W1, W2, ... */ char *ruleError; } RULE_TBL_ELEMENT; typedef RULE_TBL_ELEMENT *RTEPTR; /* * Private Function Declarations */ /* * Exported from libbidir library */ extern int Trace ( U_Int_32 traceValue ); extern void TraceOffAll ( void ); extern int GetFileFormat ( void ); extern void SetFileFormat ( int format ); extern int GetUBAVersion ( void ); extern void SetUBAVersion ( int version ); extern int convertHexToInt ( U_Int_32 *dest, char *source ); extern int br_Evaluate ( char *data, int *outval ); extern char *copyField ( char *dest, char *src ); extern char *copySubField ( char *dest, char *src ); extern void br_ErrPrint ( char *errString ); extern int br_GetBCFromLabel ( char* src ); extern BIDIPROP br_GetBC ( U_Int_32 c ); extern BPTPROP br_GetBPT ( U_Int_32 c ); extern U_Int_32 br_GetBPB ( U_Int_32 c ); extern int br_InitTable ( int version ); extern int br_UBA ( UBACTXTPTR ctxt ); extern int br_Check ( UBACTXTPTR ctxt ); /* * Exported from brinput.c for use in bidiref.c */ extern int br_RunStaticTest ( void ); extern int br_RunFileTests ( int version, char *input ); /* * Exported from brtable.c for use in brtest.c */ extern int br_GetInitDone ( void ); #ifdef __cplusplus } #endif #endif /* __BIDIREFP_LOADED */ .